diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py index 0027cc4ba352..ba6c0f9c9679 100644 --- a/apps/android_rpc/tests/android_rpc_test.py +++ b/apps/android_rpc/tests/android_rpc_test.py @@ -56,37 +56,15 @@ def test_rpc_module(): tracker = rpc.connect_tracker(tracker_host, tracker_port) remote = tracker.request(key, priority=0, session_timeout=60) - # Compile the Graph for CPU target - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].parallel(xi) - s[B].pragma(xo, "parallel_launch_point") - s[B].pragma(xi, "parallel_barrier_when_finish") - f = tvm.build(s, [A, B], target, name="myadd_cpu") - path_dso_cpu = temp.relpath("cpu_lib.so") - f.export_library(path_dso_cpu, fcompile=ndk.create_shared) + mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "myadd")) + sch = tvm.tir.Schedule(mod) + (x,) = sch.get_loops(block=sch.get_block("B")) + xo, xi = sch.split(x, [None, 32]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") - # Execute the portable graph on cpu target - print("Run CPU test ...") - dev = remote.cpu(0) - remote.upload(path_dso_cpu) - f2 = remote.load_module("cpu_lib.so") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) - time_f = f2.time_evaluator(f2.entry_name, dev, number=10) - cost = time_f(a, b).mean - print("%g secs/op\n" % cost) - np.testing.assert_equal(b.numpy(), a.numpy() + 1) - - # Compile the Graph for OpenCL target if test_opencl: - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - # Build the dynamic lib. 
- # If we don't want to do metal and only use cpu, just set target to be target - f = tvm.build(s, [A, B], tvm.target.Target("opencl", host=target), name="myadd") + f = tvm.build(sch.mod, target=tvm.target.Target("opencl", host=target)) path_dso_cl = temp.relpath("dev_lib_cl.so") f.export_library(path_dso_cl, fcompile=ndk.create_shared) @@ -101,29 +79,6 @@ def test_rpc_module(): print("%g secs/op\n" % cost) np.testing.assert_equal(b.numpy(), a.numpy() + 1) - # Compile the Graph for Vulkan target - if test_vulkan: - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - # Build the dynamic lib. - # If we don't want to do metal and only use cpu, just set target to be target - f = tvm.build(s, [A, B], tvm.target.Target("vulkan", host=target), name="myadd") - path_dso_vulkan = temp.relpath("dev_lib_vulkan.so") - f.export_library(path_dso_vulkan, fcompile=ndk.create_shared) - - print("Run GPU(Vulkan Flavor) test ...") - dev = remote.vulkan(0) - remote.upload(path_dso_vulkan) - f1 = remote.load_module("dev_lib_vulkan.so") - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) - time_f = f1.time_evaluator(f1.entry_name, dev, number=10) - cost = time_f(a, b).mean - print("%g secs/op\n" % cost) - np.testing.assert_equal(b.numpy(), a.numpy() + 1) - if __name__ == "__main__": test_rpc_module() diff --git a/apps/ios_rpc/tests/ios_rpc_test.py b/apps/ios_rpc/tests/ios_rpc_test.py index f0c31cd7d268..3e807adf484c 100644 --- a/apps/ios_rpc/tests/ios_rpc_test.py +++ b/apps/ios_rpc/tests/ios_rpc_test.py @@ -50,25 +50,19 @@ def test_rpc_module(host, port, key, mode): A = te.placeholder((n,), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") temp = utils.tempdir() - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - s[B].bind(xo, 
te.thread_axis("blockIdx.x")) + mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "myadd")) + sch = tvm.tir.Schedule(mod) + (i,) = sch.get_loops(block=sch.get_block("B")) + i0, i1 = sch.split(i, [None, 32]) + sch.bind(i0, "blockIdx.x") + sch.bind(i1, "threadIdx.x") + # Build the dynamic lib. # If we don't want to do metal and only use cpu, just set target to be target - f = tvm.build(s, [A, B], tvm.target.Target("metal", host=target), name="myadd") + f = tvm.build(sch.mod, target=tvm.target.Target("metal", host=target)) path_dso1 = temp.relpath("dev_lib.dylib") f.export_library(path_dso1, fcompile=xcode.create_dylib, arch=arch, sdk=sdk) - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].parallel(xi) - s[B].pragma(xo, "parallel_launch_point") - s[B].pragma(xi, "parallel_barrier_when_finish") - f = tvm.build(s, [A, B], target, name="myadd_cpu") - path_dso2 = temp.relpath("cpu_lib.dylib") - f.export_library(path_dso2, fcompile=xcode.create_dylib, arch=arch, sdk=sdk) - # connect to the proxy if mode == "tracker": remote = MODES[mode](host, port).request(key) @@ -84,17 +78,6 @@ def test_rpc_module(host, port, key, mode): cost = time_f(a, b).mean print("Metal: %g secs/op" % cost) np.testing.assert_equal(b.numpy(), a.numpy() + 1) - # CPU - dev = remote.cpu(0) - remote.upload(path_dso2) - f2 = remote.load_module("cpu_lib.dylib") - a_np = np.random.uniform(size=1024).astype(A.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) - time_f = f2.time_evaluator(f2.entry_name, dev, number=10) - cost = time_f(a, b).mean - print("CPU: %g secs/op" % cost) - np.testing.assert_equal(b.numpy(), a.numpy() + 1) if __name__ == "__main__": diff --git a/ci/jenkins/generated/arm_jenkinsfile.groovy b/ci/jenkins/generated/arm_jenkinsfile.groovy index 5e48cc65004b..03ea3a028040 100644 --- a/ci/jenkins/generated/arm_jenkinsfile.groovy +++ b/ci/jenkins/generated/arm_jenkinsfile.groovy @@ 
-60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2025-02-15T10:14:10.162250 +// Generated at 2025-02-15T20:02:41.820729 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -545,274 +545,3 @@ def build() { } } build() - - - -def shard_run_integration_aarch64_1_of_4(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_arm) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=integration: aarch64', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=0', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_arm) - python_unittest(ci_arm) - sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('integration: aarch64 1 of 4') - } -} - -def shard_run_integration_aarch64_2_of_4(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && 
is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_arm) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=integration: aarch64', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=1', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_arm) - python_unittest(ci_arm) - sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('integration: aarch64 2 of 4') - } -} - -def shard_run_integration_aarch64_3_of_4(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_arm) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=integration: aarch64', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=2', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm", - label: 'Download artifacts from S3', - ) - - 
ci_setup(ci_arm) - python_unittest(ci_arm) - sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('integration: aarch64 3 of 4') - } -} - -def shard_run_integration_aarch64_4_of_4(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-arm") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_arm) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=arm', - 'TEST_STEP_NAME=integration: aarch64', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=3', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/arm", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_arm) - python_unittest(ci_arm) - sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_aarch64 --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 
'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('integration: aarch64 4 of 4') - } -} - - - -def test() { - stage('Test') { - environment { - SKIP_SLOW_TESTS = "${skip_slow_tests}" - } - parallel( - 'integration: aarch64 1 of 4': { - try { - shard_run_integration_aarch64_1_of_4('ARM-GRAVITON3-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_integration_aarch64_1_of_4('ARM-GRAVITON3') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'integration: aarch64 2 of 4': { - try { - shard_run_integration_aarch64_2_of_4('ARM-GRAVITON3-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_integration_aarch64_2_of_4('ARM-GRAVITON3') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'integration: aarch64 3 of 4': { - try { - shard_run_integration_aarch64_3_of_4('ARM-GRAVITON3-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_integration_aarch64_3_of_4('ARM-GRAVITON3') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'integration: aarch64 4 of 4': { - try { - shard_run_integration_aarch64_4_of_4('ARM-GRAVITON3-SPOT') - } catch 
(Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_integration_aarch64_4_of_4('ARM-GRAVITON3') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - ) - } -} -test() diff --git a/ci/jenkins/generated/cpu_jenkinsfile.groovy b/ci/jenkins/generated/cpu_jenkinsfile.groovy index b54fdf51ca3c..627bb85862f3 100644 --- a/ci/jenkins/generated/cpu_jenkinsfile.groovy +++ b/ci/jenkins/generated/cpu_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2025-02-15T10:14:10.181874 +// Generated at 2025-02-15T19:40:24.687837 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -553,158 +553,21 @@ build() -def shard_run_integration_CPU_1_of_4(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_cpu) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=0', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - // only run upload 
if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('integration: CPU 1 of 4') - } -} - -def shard_run_integration_CPU_2_of_4(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_cpu) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=1', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('integration: CPU 2 of 4') - } -} - -def shard_run_integration_CPU_3_of_4(node_type) { - echo 'Begin running on node_type ' + node_type - if 
(!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_cpu) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=2', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('integration: CPU 3 of 4') - } -} -def shard_run_integration_CPU_4_of_4(node_type) { +def shard_run_unittest_CPU_1_of_2(node_type) { echo 'Begin running on node_type ' + node_type if (!skip_ci && is_docs_only_build != 1) { node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/integration-python-cpu") { + ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/ut-python-cpu") { // NOTE: if exception happens, it will be caught outside init_git() docker_init(ci_cpu) timeout(time: max_time, unit: 'MINUTES') { withEnv([ 'PLATFORM=cpu', - 'TEST_STEP_NAME=integration: CPU', - 'TVM_NUM_SHARDS=4', - 'TVM_SHARD_INDEX=3', + 'TEST_STEP_NAME=unittest: CPU', + 'TVM_NUM_SHARDS=2', + 'TVM_SHARD_INDEX=0', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: 
"./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu", @@ -712,16 +575,14 @@ def shard_run_integration_CPU_4_of_4(node_type) { ) ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) + cpp_unittest(ci_cpu) + python_unittest(ci_cpu) }) } // only run upload if things are successful try { sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/integration_CPU --items build/pytest-results", + script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/unittest_CPU --items build/pytest-results", label: 'Upload JUnits to S3', ) @@ -733,13 +594,11 @@ def shard_run_integration_CPU_4_of_4(node_type) { } echo 'End running on node_type ' + node_type } else { - Utils.markStageSkippedForConditional('integration: CPU 4 of 4') + Utils.markStageSkippedForConditional('unittest: CPU 1 of 2') } } - - -def shard_run_unittest_CPU_1_of_1(node_type) { +def shard_run_unittest_CPU_2_of_2(node_type) { echo 'Begin running on node_type ' + node_type if (!skip_ci && is_docs_only_build != 1) { node(node_type) { @@ -751,8 +610,8 @@ def shard_run_unittest_CPU_1_of_1(node_type) { withEnv([ 'PLATFORM=cpu', 'TEST_STEP_NAME=unittest: CPU', - 'TVM_NUM_SHARDS=1', - 'TVM_SHARD_INDEX=0', + 'TVM_NUM_SHARDS=2', + 'TVM_SHARD_INDEX=1', "SKIP_SLOW_TESTS=${skip_slow_tests}"], { sh( script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/cpu", @@ -779,7 +638,7 @@ def shard_run_unittest_CPU_1_of_1(node_type) { } echo 'End running on node_type ' + node_type } else { - Utils.markStageSkippedForConditional('unittest: CPU 1 of 1') + Utils.markStageSkippedForConditional('unittest: CPU 2 of 2') } } @@ -790,60 +649,9 @@ def test() { SKIP_SLOW_TESTS = "${skip_slow_tests}" } parallel( - 'integration: CPU 1 of 4': { - 
try { - shard_run_integration_CPU_1_of_4('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_integration_CPU_1_of_4('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'integration: CPU 2 of 4': { - try { - shard_run_integration_CPU_2_of_4('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_integration_CPU_2_of_4('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'integration: CPU 3 of 4': { - try { - shard_run_integration_CPU_3_of_4('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_integration_CPU_3_of_4('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'integration: CPU 4 of 4': { + 'unittest: CPU 1 of 2': { try { - shard_run_integration_CPU_4_of_4('CPU-SMALL-SPOT') + shard_run_unittest_CPU_1_of_2('CPU-SMALL-SPOT') } catch (Throwable ex) { if (is_last_build()) { // retry if at last build @@ -851,16 +659,16 @@ def test() { // and try again via on demand node echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' currentBuild.result = 'SUCCESS' - 
shard_run_integration_CPU_4_of_4('CPU-SMALL') + shard_run_unittest_CPU_1_of_2('CPU-SMALL') } else { echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' throw ex } } }, - 'unittest: CPU 1 of 1': { + 'unittest: CPU 2 of 2': { try { - shard_run_unittest_CPU_1_of_1('CPU-SMALL-SPOT') + shard_run_unittest_CPU_2_of_2('CPU-SMALL-SPOT') } catch (Throwable ex) { if (is_last_build()) { // retry if at last build @@ -868,7 +676,7 @@ def test() { // and try again via on demand node echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' currentBuild.result = 'SUCCESS' - shard_run_unittest_CPU_1_of_1('CPU-SMALL') + shard_run_unittest_CPU_2_of_2('CPU-SMALL') } else { echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' throw ex diff --git a/ci/jenkins/generated/hexagon_jenkinsfile.groovy b/ci/jenkins/generated/hexagon_jenkinsfile.groovy index da20f33bbb3d..a9014337a74a 100644 --- a/ci/jenkins/generated/hexagon_jenkinsfile.groovy +++ b/ci/jenkins/generated/hexagon_jenkinsfile.groovy @@ -60,7 +60,7 @@ // 'python3 jenkins/generate.py' // Note: This timestamp is here to ensure that updates to the Jenkinsfile are // always rebased on main before merging: -// Generated at 2025-02-15T10:14:10.056677 +// Generated at 2025-02-15T19:31:36.031215 import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // These are set at runtime from data in ci/jenkins/docker-images.yml, update @@ -552,519 +552,12 @@ build() - -def shard_run_test_Hexagon_1_of_8(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_hexagon) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=0', - 
"SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_hexagon) - cpp_unittest(ci_hexagon) - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('test: Hexagon 1 of 8') - } -} - -def shard_run_test_Hexagon_2_of_8(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_hexagon) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=1', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_hexagon) - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results", - label: 'Upload JUnits 
to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('test: Hexagon 2 of 8') - } -} - -def shard_run_test_Hexagon_3_of_8(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_hexagon) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=2', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_hexagon) - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('test: Hexagon 3 of 8') - } -} - -def shard_run_test_Hexagon_4_of_8(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_hexagon) - timeout(time: max_time, unit: 
'MINUTES') { - withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=3', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_hexagon) - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('test: Hexagon 4 of 8') - } -} - -def shard_run_test_Hexagon_5_of_8(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_hexagon) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=4', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_hexagon) - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} 
--prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('test: Hexagon 5 of 8') - } -} - -def shard_run_test_Hexagon_6_of_8(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_hexagon) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=5', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_hexagon) - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('test: Hexagon 6 of 8') - } -} - -def shard_run_test_Hexagon_7_of_8(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { - // NOTE: if exception 
happens, it will be caught outside - init_git() - docker_init(ci_hexagon) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=6', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_hexagon) - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) - }) - } - // only run upload if things are successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('test: Hexagon 7 of 8') - } -} - -def shard_run_test_Hexagon_8_of_8(node_type) { - echo 'Begin running on node_type ' + node_type - if (!skip_ci && is_docs_only_build != 1) { - node(node_type) { - ws("workspace/exec_${env.EXECUTOR_NUMBER}/tvm/test-hexagon") { - // NOTE: if exception happens, it will be caught outside - init_git() - docker_init(ci_hexagon) - timeout(time: max_time, unit: 'MINUTES') { - withEnv([ - 'PLATFORM=hexagon', - 'TEST_STEP_NAME=test: Hexagon', - 'TVM_NUM_SHARDS=8', - 'TVM_SHARD_INDEX=7', - "SKIP_SLOW_TESTS=${skip_slow_tests}"], { - sh( - script: "./${jenkins_scripts_root}/s3.py --action download --bucket ${s3_bucket} --prefix ${s3_prefix}/hexagon", - label: 'Download artifacts from S3', - ) - - ci_setup(ci_hexagon) - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) - }) - } - // only run upload if things are 
successful - try { - sh( - script: "./${jenkins_scripts_root}/s3.py --action upload --bucket ${s3_bucket} --prefix ${s3_prefix}/pytest-results/test_Hexagon --items build/pytest-results", - label: 'Upload JUnits to S3', - ) - - junit 'build/pytest-results/*.xml' - } catch (Exception e) { - echo 'Exception during JUnit upload: ' + e.toString() - } - } - } - echo 'End running on node_type ' + node_type - } else { - Utils.markStageSkippedForConditional('test: Hexagon 8 of 8') - } -} - - def test() { stage('Test') { environment { SKIP_SLOW_TESTS = "${skip_slow_tests}" } parallel( - 'test: Hexagon 1 of 8': { - try { - shard_run_test_Hexagon_1_of_8('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_test_Hexagon_1_of_8('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'test: Hexagon 2 of 8': { - try { - shard_run_test_Hexagon_2_of_8('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_test_Hexagon_2_of_8('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'test: Hexagon 3 of 8': { - try { - shard_run_test_Hexagon_3_of_8('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - 
shard_run_test_Hexagon_3_of_8('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'test: Hexagon 4 of 8': { - try { - shard_run_test_Hexagon_4_of_8('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_test_Hexagon_4_of_8('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'test: Hexagon 5 of 8': { - try { - shard_run_test_Hexagon_5_of_8('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_test_Hexagon_5_of_8('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'test: Hexagon 6 of 8': { - try { - shard_run_test_Hexagon_6_of_8('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_test_Hexagon_6_of_8('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'test: Hexagon 7 of 8': { - try { - shard_run_test_Hexagon_7_of_8('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT 
run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_test_Hexagon_7_of_8('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, - 'test: Hexagon 8 of 8': { - try { - shard_run_test_Hexagon_8_of_8('CPU-SMALL-SPOT') - } catch (Throwable ex) { - if (is_last_build()) { - // retry if at last build - // mark the current stage as success - // and try again via on demand node - echo 'Exception during SPOT run ' + ex.toString() + ' retry on-demand' - currentBuild.result = 'SUCCESS' - shard_run_test_Hexagon_8_of_8('CPU-SMALL') - } else { - echo 'Exception during SPOT run ' + ex.toString() + ' exit since it is not last build' - throw ex - } - } - }, ) } } diff --git a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 index aa999408a7e2..0781bc92dbe5 100644 --- a/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/arm_jenkinsfile.groovy.j2 @@ -33,25 +33,3 @@ make_cpp_tests(ci_arm, 'build') {{ m.upload_artifacts(tag='arm', filenames=tvm_lib + cpptest) }} {% endcall %} - -{% set test_method_names = [] %} - -{% call(shard_index, num_shards) m.sharded_test_step( - name="integration: aarch64", - num_shards=4, - ws="tvm/ut-python-arm", - platform="arm", - docker_image="ci_arm", - test_method_names=test_method_names, -) %} - {{ m.download_artifacts(tag='arm') }} - ci_setup(ci_arm) - python_unittest(ci_arm) - sh ( - script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) -{% endcall %} - - -{{ m.invoke_tests(node="ARM-GRAVITON3", test_method_names=test_method_names) -}} diff --git a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 index e34132c94111..c84b0c48a29f 100644 --- a/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/cpu_jenkinsfile.groovy.j2 @@ -40,27 +40,12 @@ 
{% set test_method_names = [] %} -{% call(shard_index, num_shards) m.sharded_test_step( - name="integration: CPU", - num_shards=4, - ws="tvm/integration-python-cpu", - platform="cpu", - docker_image="ci_cpu", - test_method_names=test_method_names, -) %} - {{ m.download_artifacts(tag='cpu') }} - ci_setup(ci_cpu) - sh ( - script: "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh", - label: 'Run CPU integration tests', - ) -{% endcall %} {% call(shard_index, num_shards) m.sharded_test_step( name="unittest: CPU", ws="tvm/ut-python-cpu", platform="cpu", - num_shards=1, + num_shards=2, docker_image="ci_cpu", test_method_names=test_method_names, ) %} diff --git a/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2 b/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2 index 91d3ce9ece42..b4177b332987 100644 --- a/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2 +++ b/ci/jenkins/templates/hexagon_jenkinsfile.groovy.j2 @@ -41,23 +41,4 @@ {% set test_method_names = [] %} -{% call(shard_index, num_shards) m.sharded_test_step( - name="test: Hexagon", - ws="tvm/test-hexagon", - platform="hexagon", - docker_image="ci_hexagon", - test_method_names=test_method_names, - num_shards=8, -) %} - {{ m.download_artifacts(tag='hexagon') }} - ci_setup(ci_hexagon) - {% if shard_index == 1 %} - cpp_unittest(ci_hexagon) - {% endif %} - sh ( - script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_python_hexagon.sh", - label: 'Run Hexagon tests', - ) -{% endcall %} - {{ m.invoke_tests(node="CPU-SMALL", test_method_names=test_method_names) -}} diff --git a/docker/Dockerfile.ci_wasm b/docker/Dockerfile.ci_wasm index 6860c51d7277..83a9f0e9f0e8 100644 --- a/docker/Dockerfile.ci_wasm +++ b/docker/Dockerfile.ci_wasm @@ -64,8 +64,8 @@ RUN bash /install/ubuntu_install_emscripten.sh ENV EMSDK=/emsdk ENV PATH=${PATH}:${EMSDK}:${EMSDK}/upstream/emscripten ENV EMSCRIPTEN=${EMSDK}/upstream/emscripten -ENV BINARYEN=${EMSDK}/upstream -ENV LLVM=${EMSDK}/upstream/bin +ENV 
EM_BINARYEN_ROOT=${EMSDK}/upstream +ENV EM_LLVM_ROOT=${EMSDK}/upstream/bin # sccache COPY install/ubuntu_install_sccache.sh /install/ubuntu_install_sccache.sh diff --git a/docs/how_to/tutorials/cross_compilation_and_rpc.py b/docs/how_to/tutorials/cross_compilation_and_rpc.py index c7e302693de7..81c73fd051ef 100644 --- a/docs/how_to/tutorials/cross_compilation_and_rpc.py +++ b/docs/how_to/tutorials/cross_compilation_and_rpc.py @@ -104,7 +104,7 @@ n = tvm.runtime.convert(1024) A = te.placeholder((n,), name="A") B = te.compute((n,), lambda i: A[i] + 1.0, name="B") -s = te.create_schedule(B.op) +mod = tvm.IRModule.from_expr(te.create_prim_func([A, B]).with_attr("global_symbol", "add_one")) ###################################################################### # Then we cross compile the kernel. @@ -119,7 +119,7 @@ else: target = "llvm -mtriple=armv7l-linux-gnueabihf" -func = tvm.build(s, [A, B], target=target, name="add_one") +func = tvm.build(mod, target=target, name="add_one") # save the lib at a local temp folder temp = utils.tempdir() path = temp.relpath("lib.tar") @@ -231,11 +231,13 @@ def run_opencl(): target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu") # create schedule for the above "add one" compute declaration - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=32) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - func = tvm.build(s, [A, B], target=target) + mod = tvm.IRModule.from_expr(te.create_prim_func([A, B])) + sch = tvm.tir.Schedule(mod) + (x,) = sch.get_loops(block=sch.get_block("B")) + xo, xi = sch.split(x, [None, 32]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + func = tvm.build(sch.mod, target=target) remote = rpc.connect(opencl_device_host, opencl_device_port) diff --git a/docs/reference/api/python/contrib.rst b/docs/reference/api/python/contrib.rst index 0eb3024c2d08..e85d3bec5caf 100644 --- a/docs/reference/api/python/contrib.rst +++
b/docs/reference/api/python/contrib.rst @@ -104,11 +104,6 @@ tvm.contrib.rocm .. automodule:: tvm.contrib.rocm :members: -tvm.contrib.sparse -~~~~~~~~~~~~~~~~~~ -.. automodule:: tvm.contrib.sparse - :members: - tvm.contrib.spirv ~~~~~~~~~~~~~~~~~ diff --git a/docs/reference/api/python/te.rst b/docs/reference/api/python/te.rst index 83e0042db1b9..363dae675d84 100644 --- a/docs/reference/api/python/te.rst +++ b/docs/reference/api/python/te.rst @@ -23,11 +23,3 @@ tvm.te :members: :imported-members: :autosummary: - - -tvm.te.hybrid -------------- -.. automodule:: tvm.te.hybrid - :members: - :imported-members: - :autosummary: diff --git a/golang/sample/deploy.py b/golang/sample/deploy.py deleted file mode 100644 index fa4cbd433549..000000000000 --- a/golang/sample/deploy.py +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Get Started with TVM Go -======================= -""" -from __future__ import absolute_import, print_function - -import tvm -from tvm import te -import numpy as np - -# Global declarations of environment. 
- -tgt = "llvm" - -###################################################################### -# Describe the Computation -# ------------------------ -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.placeholder((n,), name="B") -C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - -###################################################################### -# Schedule the Computation -# ------------------------ -s = te.create_schedule(C.op) - -###################################################################### -# Compilation -# ----------- -fadd = tvm.build(s, [A, B, C], tgt, name="myadd") - -###################################################################### -# Save Compiled Module -# -------------------- -from tvm.contrib import cc -from tvm.contrib import utils - -fadd.save("deploy.o") -cc.create_shared("deploy.so", ["deploy.o"]) diff --git a/jvm/README.md b/jvm/README.md index c7535f0311b4..62b685010c2e 100644 --- a/jvm/README.md +++ b/jvm/README.md @@ -89,35 +89,6 @@ It is your job to verify the types of callback arguments, as well as the type of You can register the Java function by `Function.register` and use `Function.getFunction` to get the registered function later. -## Use TVM to Generate Shared Library - -There's nothing special for this part. The following Python snippet generate add_cpu.so which add two vectors on CPU. 
- -```python -import os -import tvm -from tvm import te -from tvm.contrib import cc, utils - -def test_add(target_dir): - n = te.var("n") - A = te.placeholder((n,), name='A') - B = te.placeholder((n,), name='B') - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = te.create_schedule(C.op) - fadd = tvm.build(s, [A, B, C], "llvm", name="myadd") - - fadd.save(os.path.join(target_dir, "add_cpu.o")) - cc.create_shared(os.path.join(target_dir, "add_cpu.so"), - [os.path.join(target_dir, "add_cpu.o")]) - -if __name__ == "__main__": - import sys - if len(sys.argv) != 2: - sys.exit(-1) - test_add(sys.argv[1]) -``` - ## Run the Generated Shared Library The following code snippet demonstrate how to load generated shared library (add_cpu.so). diff --git a/jvm/core/src/test/scripts/test_add_cpu.py b/jvm/core/src/test/scripts/test_add_cpu.py deleted file mode 100644 index 9a93d4e74694..000000000000 --- a/jvm/core/src/test/scripts/test_add_cpu.py +++ /dev/null @@ -1,43 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import os - -import tvm -from tvm import te -from tvm.contrib import cc, utils - - -def test_add(target_dir): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = te.create_schedule(C.op) - fadd = tvm.build(s, [A, B, C], "llvm", name="myadd") - - fadd.save(os.path.join(target_dir, "add_cpu.o")) - cc.create_shared( - os.path.join(target_dir, "add_cpu.so"), [os.path.join(target_dir, "add_cpu.o")] - ) - - -if __name__ == "__main__": - import sys - - if len(sys.argv) != 2: - sys.exit(-1) - test_add(sys.argv[1]) diff --git a/jvm/core/src/test/scripts/test_add_gpu.py b/jvm/core/src/test/scripts/test_add_gpu.py deleted file mode 100644 index 0eea5671baed..000000000000 --- a/jvm/core/src/test/scripts/test_add_gpu.py +++ /dev/null @@ -1,58 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import os - -import tvm -from tvm import te -from tvm.contrib import cc, nvcc, utils - - -@tvm.register_func("tvm_callback_cuda_compile", override=True) -def tvm_callback_cuda_compile(code, target): - ptx = nvcc.compile_cuda(code, target_format="ptx") - return ptx - - -def test_add(target_dir): - if not tvm.runtime.enabled("cuda"): - print("skip %s because cuda is not enabled..." % __file__) - return - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - - s = te.create_schedule(C.op) - - bx, tx = s[C].split(C.op.axis[0], factor=64) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - fadd_cuda = tvm.build(s, [A, B, C], tvm.target.Target("cuda", host="llvm"), name="myadd") - - fadd_cuda.save(os.path.join(target_dir, "add_cuda.o")) - fadd_cuda.imported_modules[0].save(os.path.join(target_dir, "add_cuda.ptx")) - cc.create_shared( - os.path.join(target_dir, "add_cuda.so"), [os.path.join(target_dir, "add_cuda.o")] - ) - - -if __name__ == "__main__": - import sys - - if len(sys.argv) != 2: - sys.exit(-1) - test_add(sys.argv[1]) diff --git a/python/tvm/contrib/peak.py b/python/tvm/contrib/peak.py deleted file mode 100644 index 78dae846d6ca..000000000000 --- a/python/tvm/contrib/peak.py +++ /dev/null @@ -1,394 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -"""measure bandwidth and compute peak""" - -import logging -import tvm -from tvm import te -from tvm.target import Target -from . import utils -from .. import rpc - - -def _convert_to_remote(func, remote): - """convert module function to remote rpc function""" - temp = utils.tempdir() - path_dso = temp.relpath("tmp_func.tar") - func.export_library(path_dso) - - remote.upload(path_dso) - func = remote.load_module("tmp_func.tar") - return func - - -def measure_bandwidth_sum( - total_item, - item_per_thread, - stride, - base_type, - bits, - lanes, - target, - target_host, - remote, - dev, - n_times, -): - """measure memory bandwidth of gpu by product reduction for a given type - - The IR for measurement is - - for each thread - for i in 1..num_per_thread: - y[global_id] = y[global_id] * x[base + i * stride] - - Parameters - ---------- - total_item: int - number of elements in input array - item_per_thread: int - number of elements each thread accumulates - stride: int - stride in memory access - base_type: str - can be "int", "float" - bits: int - can be 16, 32 - lanes: int - lane of the vector type, can be 1, 2, 4, 8, 16 - target: :any:`tvm.target.Target` - the target and option of the compilation. 
- target_host : str or :any:`tvm.target.Target` - host compilation target - dev: Device - the device of array - remote: tvm.rpc.RPCSession - remote rpc session - n_times: int - number of runs for taking mean - - Returns - ------- - GBPS: float - gigabyte per second - """ - target, target_host = Target.canon_target_and_host(target, target_host) - - n, m = total_item, item_per_thread - n //= lanes - - base_type = str(base_type) + str(bits) - dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) - - k = te.reduce_axis((0, m), name="k") - - x = te.placeholder((n,), dtype=dtype, name="x") - op = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="sum") - y = te.compute( - (n // m,), lambda i: op(x[i // stride * stride * m + i % stride + k * stride], axis=k) - ) - s = te.create_schedule(y.op) - - yo, yi = s[y].split(y.op.axis[0], target.max_num_threads) - s[y].bind(yo, te.thread_axis("blockIdx.x")) - s[y].bind(yi, te.thread_axis("threadIdx.x")) - s[y].unroll(k) - - try: - func = tvm.build(s, [x, y], target) - - x = tvm.nd.empty((n,), dtype=dtype, device=dev) - y = tvm.nd.empty((n // m,), dtype=dtype, device=dev) - - func = _convert_to_remote(func, remote) - time_f = func.time_evaluator(func.entry_name, dev, number=n_times) - time = time_f(x, y).mean - except tvm._ffi.base.TVMError: - # build error (occur when device does not support half) - return -1 - - return 1.0 * (total_item * bits / 8) / 1e9 / time - - -def measure_bandwidth_all_types( - total_item, item_per_thread, n_times, target, target_host, remote, dev, verbose=True -): - """measure memory bandwidth for all types - - Parameters - ---------- - total_item: int - number of elements in input array - item_per_thread: int - number of elements each thread accmulates - n_times: int - number of runs for averaging - target: :any:`tvm.target.Target` - the target and option of the compilation. 
- target_host : str or :any:`tvm.target.Target` - host compilation target - remote: tvm.rpc.RPCSession - remote rpc session - dev: Device - the device of array - verbose: bool - whether outputs immediate result - - Returns - ------- - result: list - a list of (type_name, GBPS) pairs - """ - target, target_host = Target.canon_target_and_host(target, target_host) - max_threads = target.max_num_threads - - result = [] - for base_type in ["float"]: - for bits in [32]: - for lanes in [1, 2, 4, 8, 16]: - max_speed = -1e9 - # try different strides - for stride in [max_threads, total_item // (lanes * item_per_thread)]: - speed = measure_bandwidth_sum( - total_item, - item_per_thread, - stride, - base_type, - bits, - lanes, - target, - target_host, - remote, - dev, - n_times, - ) - max_speed = max(max_speed, speed) - type_name = base_type + str(bits) - result.append([f"{type_name}x{lanes}", max_speed]) - if verbose: - logging.info("\t%-10s %.2f GBPS", result[-1][0], result[-1][1]) - return result - - -def measure_compute_mad( - total_item, item_per_thread, base_type, bits, lanes, target, target_host, remote, dev, n_times -): - """measure peak compute speed by computing mad for a type - - The IR for measurement is - - for each thread - for i in 1..item_per_thread - x = mad(x, x, y) - y = mad(y, y, x) - - Parameters - ---------- - total_item: int - number of elements in input array - item_per_thread: int - number of operations each thread does - base_type: str - can be "int", "float" - bits: int - can be 16, 32 - lanes: int - lane of the vector type, can be 1, 2, 4, 8, 16 - target: :any:`tvm.target.Target` - the target and option of the compilation. 
- target_host : str or :any:`tvm.target.Target` - host compilation target - remote: tvm.rpc.RPCSession - if it is not None, use remote rpc session - dev: Device - the device of array - n_times: int - number of runs for taking mean - - Returns - ------- - GOPS: float - giga operation per second - """ - target, target_host = Target.canon_target_and_host(target, target_host) - - n = total_item - - if bits >= 64 or lanes >= 16: - n //= 2 - - max_threads = target.max_num_threads - - base_type = str(base_type) + str(bits) - dtype = base_type if lanes == 1 else base_type + "x" + str(lanes) - - def extern(ins, outs): - # pylint: disable=unused-argument - """construct measurement function by building IR directly""" - ib = tvm.tir.ir_builder.create() - - bx = te.thread_axis("blockIdx.x") - tx = te.thread_axis("threadIdx.x") - - ib.scope_attr(bx, "thread_extent", n // max_threads) - ib.scope_attr(tx, "thread_extent", max_threads) - - idx = bx.var * max_threads + tx.var - - a = ib.allocate(dtype, (1), name="a", scope="local") - b = ib.allocate(dtype, (1), name="b", scope="local") - - a[0] = outs[0].vload(idx, dtype) - b[0] = outs[0].vload(idx, dtype) - - if base_type.find("float") != -1: - - def mad_func(x, y): - return x * x + y - - else: - - def mad_func(x, y): - return y * y + x - - for _ in range(item_per_thread // 4 // lanes): - a[0] = mad_func(a[0], b[0]) - b[0] = mad_func(b[0], a[0]) - - ib.emit(outs[0].vstore(idx, b[0])) - return ib.get() - - y = te.extern((n,), [], extern, name="y", dtype=dtype) - s = te.create_schedule(y.op) - - try: - func = tvm.build(s, [y], target) - func = _convert_to_remote(func, remote) - time_f = func.time_evaluator(func.entry_name, dev, number=n_times) - y = tvm.nd.empty((n,), dtype=dtype, device=dev) - time = time_f(y).mean - except tvm._ffi.base.TVMError: - # build error (occur when device does not support half) - return -1 - - return 1.0 * (n * item_per_thread) / 1e9 / time - - -def measure_compute_all_types( - total_item, item_per_thread, 
n_times, target, target_host, remote, dev, verbose=True -): - """measure peak flops for all types - - Parameters - ---------- - total_item: int - number of elements in input array - item_per_thread: int - number of elements each thread accmulates - n_times: int - number of runs for averaging - target: :any:`tvm.target.Target` - the target and option of the compilation. - target_host : str or :any:`tvm.target.Target` - host compilation target - remote: tvm.rpc.RPCSession - remote rpc session - dev: Device - the device of array - verbose: bool - whether outputs immediate result - - Returns - ------- - result: list - a list of (type_name, GFLOPS/GIOPS) pairs - """ - target, target_host = Target.canon_target_and_host(target, target_host) - - result = [] - for base_type in ["float", "int"]: - for bits in [16, 32, 64]: - for lanes in [1, 2, 4, 8, 16]: - if base_type == "int" and bits != 32: # only measure int32 - continue - - max_speed = -1e9 - for per_thread in [item_per_thread // 2, item_per_thread, item_per_thread * 2]: - speed = measure_compute_mad( - total_item, - per_thread, - base_type, - bits, - lanes, - target, - target_host, - remote, - dev, - n_times, - ) - max_speed = max(max_speed, speed) - type_name = base_type + str(bits) - result.append([f"{type_name}x{lanes}", max_speed]) - - unit = "GFLOPS" if base_type == "float" else "GIOPS" - - if verbose: - logging.info("\t%-10s %.2f %s", result[-1][0], result[-1][1], unit) - - return result - - -def measure_peak_all(target, target_host, host, port): - """measure memory bandwidth and peak compute for gpu devices - - Parameters - ---------- - target: str or :any:`tvm.target.Target` - target_host: str - host: str - port: int - """ - - target, target_host = Target.canon_target_and_host(target, target_host) - remote = rpc.connect(host, port) - n_times = 20 - - bandwidth_total_item = 1 << 25 - bandwidth_item_per_thread = 32 - - compute_total_item = 1 << 21 - compute_item_per_thread = 4096 - - if 
str(target).startswith("opencl"): - dev = remote.cl() - elif str(target).startswith("cuda"): - dev = remote.cuda() - elif str(target).startswith("metal"): - dev = remote.metal() - else: - raise RuntimeError("Unsupported target") - - logging.info("========== measure memory bandwidth ==========") - measure_bandwidth_all_types( - bandwidth_total_item, bandwidth_item_per_thread, n_times, target, target_host, remote, dev - ) - - logging.info("========== measure peak compute ==========") - measure_compute_all_types( - compute_total_item, compute_item_per_thread, n_times, target, target_host, remote, dev - ) diff --git a/python/tvm/contrib/sparse.py b/python/tvm/contrib/sparse.py deleted file mode 100644 index 9f94ff24f906..000000000000 --- a/python/tvm/contrib/sparse.py +++ /dev/null @@ -1,204 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Tensor and Operation class for computation declaration.""" -# pylint: disable=invalid-name -import warnings -import numpy as _np -from tvm.runtime import ndarray as _nd -from tvm import te -from tvm.tir import expr as _expr -from tvm.te import tensor as _tensor - - -float32 = "float32" -itype = "int32" - - -class CSRNDArray(object): - """Sparse tensor object in CSR format.""" - - def __init__(self, arg1, device=None, shape=None): - """Construct a sparse matrix in CSR format. - - Parameters - ---------- - arg1 : numpy.ndarray or a tuple with (data, indices, indptr) - The corresponding a dense numpy array, - or a tuple for constructing a sparse matrix directly. - - device: Device - The corresponding device. - - shape : tuple of int - The shape of the array - """ - if isinstance(arg1, tuple): - assert len(arg1) == 3 - self.data, self.indices, self.indptr = arg1 - self.shape = shape - elif isinstance(arg1, _np.ndarray): - source_array = arg1 - ridx, cidx = _np.nonzero(source_array) - data = source_array[ridx, cidx] - self.data = _nd.array(data, device) - indices = _np.nonzero(source_array)[1].astype(itype) - self.indices = _nd.array(indices, device) - indptr = [0] + _np.apply_along_axis( - _np.count_nonzero, axis=1, arr=source_array - ).tolist() - indptr = _np.cumsum(_np.array(indptr, itype)).astype(itype) - self.indptr = _nd.array(indptr, device) - self.shape = source_array.shape - else: - raise RuntimeError( - f"Construct CSRNDArray with either a tuple (data, indices, indptr) " - f"or a numpy.array, can't handle type {type(arg1)}." 
- ) - self.stype = "csr" - self.dtype = self.data.dtype - assert self.shape is not None - assert isinstance(self.data, _nd.NDArray) - assert isinstance(self.indices, _nd.NDArray) - assert str(self.indices.dtype) == "int32" or str(self.indices.dtype) == "int64", str( - self.indices.dtype - ) - assert isinstance(self.indptr, _nd.NDArray) - assert str(self.indptr.dtype) == "int32" or str(self.indptr.dtype) == "int64", str( - self.indptr.dtype - ) - - def asnumpy(self): - """Construct a full matrix and convert it to numpy array. This API will be deprecated - in TVM v0.8 release. Please use `numpy` instead.""" - warnings.warn( - "CSRNDArray.asnumpy() will be deprecated in TVM v0.8 release. " - "Please use CSRNDArray.numpy() instead.", - DeprecationWarning, - ) - return self.numpy() - - def numpy(self): - """Construct a full matrix and convert it to numpy array.""" - full = _np.zeros(self.shape, self.dtype) - ridx = _np.diff(self.indptr.numpy()) - ridx = _np.hstack([_np.ones((v,), itype) * i for i, v in enumerate(ridx)]) - full[ridx, self.indices.numpy().astype(itype)] = self.data.numpy() - return full - - -def array(source_array, device=None, shape=None, stype="csr"): - """Construct a sparse NDArray from numpy.ndarray""" - ret = None - if stype == "csr": - ret = CSRNDArray(source_array, shape=shape, device=device) - else: - raise NotImplementedError(f"stype={stype} is not supported yet.") - return ret - - -class SparsePlaceholderOp(object): - """Placeholder class for sparse tensor representations.""" - - def __init__(self, shape, nonzeros, dtype, name): - # pylint: disable=unused-argument - """Contructing a bare bone structure for a sparse matrix - - Parameters - ---------- - shape: Tuple of Expr - The shape of the tensor - - nonzeros: int - The number of non-zero values - - dtype: str, optional - The data type of the tensor - - name: str, optional - The name hint of the tensor - """ - self.shape = shape - self.dtype = dtype - self.name = name - self.stype = "unknown" - 
- -class CSRPlaceholderOp(SparsePlaceholderOp): - """Placeholder class for CSR based sparse tensor representation.""" - - def __init__(self, shape, nonzeros, dtype, name): - """Contructing a bare bone structure for a csr_matrix - - Parameters - ---------- - shape: Tuple of Expr - The shape of the tensor - - nonzeros: int - The number of non-zero values - - dtype: str, optional - The data type of the tensor - - name: str, optional - The name hint of the tensor - """ - SparsePlaceholderOp.__init__(self, shape, nonzeros, dtype, name) - self.stype = "csr" - self.data = te.placeholder((nonzeros,), dtype=dtype, name=self.name + "_data") - self.indices = te.placeholder((nonzeros,), dtype=itype, name=self.name + "_indices") - self.indptr = te.placeholder((self.shape[0] + 1,), dtype=itype, name=self.name + "_indptr") - assert isinstance(self.data, _tensor.Tensor) - assert isinstance(self.indices, _tensor.Tensor) - assert isinstance(self.indptr, _tensor.Tensor) - - -def placeholder(shape, nonzeros=None, dtype=None, name="placeholder", stype=None): - """Construct an empty sparse tensor object. - - Parameters - ---------- - shape: Tuple of Expr - The shape of the tensor - - nonzeros: int - The number of non-zero values - - dtype: str, optional - The data type of the tensor - - name: str, optional - The name hint of the tensor - - stype: str, optional - The name storage type of the sparse tensor (e.g. 
csr, coo, ell) - - Returns - ------- - tensor: SparsePlaceholderOp - The created sparse tensor placeholder - """ - shape = (shape,) if isinstance(shape, _expr.PrimExpr) else shape - nonzeros = 0 if nonzeros is None else nonzeros - dtype = float32 if dtype is None else dtype - stype = "csr" if stype is None else stype - ret = None - if stype == "csr": - ret = CSRPlaceholderOp(shape=shape, nonzeros=nonzeros, dtype=dtype, name=name) - else: - raise NotImplementedError(f"stype={stype} is not supported yet.") - return ret diff --git a/python/tvm/contrib/tedd.py b/python/tvm/contrib/tedd.py deleted file mode 100644 index 680297729789..000000000000 --- a/python/tvm/contrib/tedd.py +++ /dev/null @@ -1,798 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=import-outside-toplevel, nested-min-max -"""Tensor Expression Debug Display (TEDD), visualizing Tensor Expression""" -import html -import json -import warnings -from graphviz import Digraph -from graphviz import Source -import tvm - -TVMDD_TABLE_BODY_WIDTH = 30 -# Must match enum IterVarType defined in include/tvm/expr.h -ITERVAR_TYPE_STRING_MAP = { - 0: ("kDataPar", "#FFFFFF"), - 1: ("kThreadIndex", "#2980B9"), - 2: ("kCommReduce", "#FAD7A0"), - 3: ("kOrdered", "#D35400"), - 4: ("kOpaque", "#ABB2B9"), - 5: ("kUnrolled", "#D2B4DE"), - 6: ("kVectorized", "#AED6F1"), - 7: ("kParallelized", "#F5B7B1"), - 8: ("kTensorized", "#A9DFBF"), -} - -PALETTE = { - 0: "#000000", - 1: "#922B21", - 2: "#76448A", - 3: "#1F618D", - 4: "#148F77", - 5: "#B7950B", - 6: "#AF601A", - 7: "#F5B7B1", - 8: "#A9DFBF", -} - -PALETTE_SIZE = 9 - - -def dom_path_to_string(dom_path, prefix=""): - path_string = prefix - for index in dom_path: - path_string = path_string + "_" + str(index) - return path_string - - -def insert_dot_id(sch): - """Insert unique ID for each node in the DOM tree. - They are used as Dot node ID. - """ - for stage_idx, stage in enumerate(sch["stages"]): - dom_path = [stage_idx] - stage["id"] = dom_path_to_string(dom_path, stage["type"]) - for itervar_idx, itervar in enumerate(stage["all_itervars"]): - dom_path = [stage_idx, itervar_idx] - itervar["id"] = dom_path_to_string(dom_path, itervar["type"]) - for rel_idx, rel in enumerate(stage["relations"]): - dom_path = [stage_idx, rel_idx] - rel["id"] = dom_path_to_string(dom_path, rel["type"]) - for tensor_idx, tensor in enumerate(stage["output_tensors"]): - dom_path = [stage_idx, tensor_idx] - tensor["id"] = dom_path_to_string(dom_path, tensor["type"]) - return sch - - -def itervar_equal(iv_a, iv_b): - """A helper method that compares the equality of two iterative variables""" - # Adopt the following method to assure the equality between two itervars. - # The plain comparison might fail (i.e. 
iv_a == iv_b) after the change of - # domain bounds from InferBound. - def _var_equal(v_a, v_b): - condtions = [ - v_a.name == v_b.name, - v_a.dtype == v_b.dtype, - v_a.type_annotation == v_b.type_annotation, - ] - return all(c for c in condtions) - - condtions = [ - _var_equal(iv_a.var, iv_b.var), - iv_a.iter_type == iv_b.iter_type, - iv_a.thread_tag == iv_b.thread_tag, - ] - return all(c for c in condtions) - - -class ObjectManager: - """A helper class tracking schedule objects, e.g. stage, IterVar, - relationship, and tensor, to their DOM path.""" - - def __init__(self, sch): - self.dict = {} - for stage_idx, stage in enumerate(sch.stages): - self.dict[stage] = [stage_idx] - for itervar_idx, itervar in enumerate(stage.all_iter_vars): - self.dict[itervar] = [stage_idx, itervar_idx] - # the itervars of leaf should also be mapped to the original one - for leaf_iv in stage.leaf_iter_vars: - if itervar_equal(leaf_iv, itervar): - self.dict[leaf_iv] = [stage_idx, itervar_idx] - for rel_idx, rel in enumerate(stage.relations): - self.dict[rel] = [stage_idx, rel_idx] - for tensor_idx in range(stage.op.num_outputs): - self.dict[frozenset({stage.op.name, tensor_idx})] = [stage_idx, tensor_idx] - - def get_dom_path(self, obj): - if obj is None: - return None - assert obj in self.dict, "Node is no found." - return self.dict[obj] - - -def get_or_create_dot_id(obj, prefix="", assert_on_missing=False): - """If obj's ID has been registered, return it. - If not, either assert or create a unique and legal ID, register and - return it, according to assert_on_missing. - ID must be a unique and legal Dotty ID. - - Parameters - ---------- - obj : objet - Serve as the key to the ID. - - prefix : string - Prefix to attach to the ID. Usually use obj's non-unique - name as prefix. - - assert_on_missing : bool - Assert or not if object doesn't have a registered ID. 
- """ - prefix = prefix.replace(".", "_") - if not hasattr(get_or_create_dot_id, "obj_id_dict"): - get_or_create_dot_id.obj_id_dict = {} - if obj not in get_or_create_dot_id.obj_id_dict: - if assert_on_missing: - assert False, "dot_id " + str(obj) + " has not been registered." - else: - get_or_create_dot_id.obj_id_dict[obj] = prefix + hex(id(obj)) - return get_or_create_dot_id.obj_id_dict[obj] - - -def get_port_id(is_input, index): - return "I_" + str(index) if is_input else "O_" + str(index) - - -def get_itervar_type_info(iter_type): - assert iter_type < len(ITERVAR_TYPE_STRING_MAP), "Unknown IterVar type: " + str(iter_type) - return ITERVAR_TYPE_STRING_MAP[iter_type] - - -def get_itervar_label_color(itervar, iv_type): - type_info = get_itervar_type_info(iv_type) - return ( - linebrk(str(itervar["name"]) + "(" + type_info[0] + ")", TVMDD_TABLE_BODY_WIDTH), - type_info[1], - ) - - -def linebrk(s, n): - """Break input string s with
for every n charactors.""" - result = "" - j = 0 - for i, c in enumerate(s): - if j == n and i != len(s) - 1: - result = result + "\n" - j = 0 - j = j + 1 - result = result + c - result = html.escape(str(result), quote=True) - result = result.replace("\n", "
") - return result - - -def create_graph(name="", rankdir="BT"): - graph = Digraph(name=name) - graph.graph_attr["rankdir"] = rankdir - return graph - - -def itervar_label(itervar, index, index_color, label): - return ( - '' - + str(index) - + '' - + label - + "
" - + str(itervar["properties"]["range"]) - + "" - ) - - -def stage_label(stage): - return stage["name"] + "
Scope: " + stage["properties"]["scope"] - - -def legend_label(): - """Generate legend labels.""" - label = '<' - for iter_type in ITERVAR_TYPE_STRING_MAP: - name, color = ITERVAR_TYPE_STRING_MAP[iter_type] - label += ( - '' + '" - ) - label += "
' + name + "
>" - return label - - -def leaf_itervars(stage): - filtered = filter(lambda x: (x["index"] >= 0), stage["all_itervars"]) - return sorted(filtered, key=lambda x: x["index"]) - - -def legend_dot(g): - with g.subgraph(name="cluster_legend") as subgraph: - subgraph.attr(label="Legend") - label = legend_label() - subgraph.node("legend", label, shape="none", margin="0") - - -def extract_dom_for_viz(sch, need_range=True): - json_str = dump_json(sch, need_range) - s = json.loads(json_str) - s = insert_dot_id(s) - return s - - -def dump_graph(dot_string, show_svg=True, dot_file_path="", output_dot_string=False): - """Output dot_string in various formats.""" - if dot_file_path: - try: - dot_file = open(dot_file_path, "w+") - dot_file.write(dot_string) - dot_file.close() - except IOError: - print("Cannot open file: " + dot_file_path) - if show_svg: - from IPython.display import display - from IPython.display import SVG - - src = Source(dot_string) - display(SVG(src.pipe(format="svg"))) - if output_dot_string: - return dot_string - return None - - -def dump_json(sch, need_range): - """Serialize data for visualization from a schedule in JSON format. 
- - Parameters - ---------- - sch : schedule - The schedule object to serialize - - Returns - ------- - json : string - Serialized JSON string - """ - - def encode_itervar(itervar, stage, index, range_map): - """Extract and encode IterVar visualization data to a dictionary""" - ivrange = range_map[itervar] if range_map is not None and itervar in range_map else None - bind_thread = None - tensor_intrin = None - if itervar in stage.iter_var_attrs: - attr = stage.iter_var_attrs[itervar] - iv_type = attr.iter_type - # binding - bind_thread = str(attr.bind_thread.var) if attr.bind_thread is not None else None - # tensorization - if attr.tensor_intrin is not None: - tensor_intrin = str(attr.tensor_intrin.body) - # remove the final \n - tensor_intrin = tensor_intrin[0:-1] if tensor_intrin[-1] == "\n" else tensor_intrin - else: - tensor_intrin = None - else: - iv_type = itervar.iter_type - itervar_dict = { - "type": "IterVar", - "index": index, - "name": str(itervar.var), - "itervar_type": iv_type, - "properties": { - "thread": bind_thread, - "intrin": tensor_intrin, - "range": str(ivrange) if ivrange is not None else "range(N/A)", - }, - } - return itervar_dict - - def encode_itervars(stage, range_map): - """Extract and encode IterVars visualization data from a stage to a dictionary""" - - def get_leaf_itervar_index(itervar, leaf_iv): - for leaf_index, ivar in enumerate(leaf_iv): - if itervar_equal(ivar, itervar): - return leaf_index - return -1 - - itervars = [] - for itervar in stage.all_iter_vars: - leaf_index = get_leaf_itervar_index(itervar, stage.leaf_iter_vars) - itervars.append(encode_itervar(itervar, stage, leaf_index, range_map)) - return itervars - - def encode_itervar_relation(obj_manager, rel): - """Extract and encode IterVar Relationship visualization data to a dictionary""" - rel_type = type(rel) - if rel_type is tvm.te.schedule.Split: - node_type = "Split_Relation" - rel_dict = { - "type": node_type, - "parent": obj_manager.get_dom_path(rel.parent), - 
"outer": obj_manager.get_dom_path(rel.outer), - "inner": obj_manager.get_dom_path(rel.inner), - } - elif rel_type is tvm.te.schedule.Fuse: - node_type = "Fuse_Relation" - rel_dict = { - "type": node_type, - "fused": obj_manager.get_dom_path(rel.fused), - "outer": obj_manager.get_dom_path(rel.outer), - "inner": obj_manager.get_dom_path(rel.inner), - } - elif rel_type is tvm.te.schedule.Singleton: - node_type = "Singleton_Relation" - rel_dict = { - "type": node_type, - "iter": obj_manager.get_dom_path(rel.iter), - } - else: - return None - return rel_dict - - def encode_itervar_relations(obj_manager, stage): - relations = [] - for i in range(len(stage.relations)): - rel = encode_itervar_relation(obj_manager, stage.relations[i]) - if rel is not None: - relations.append(rel) - return relations - - def encode_tensor(obj_manager, tensor, stage): - """Extract and encode tensor visualization data to a dictionary""" - tensor_dict = { - "type": "Tensor", - "source": obj_manager.get_dom_path(stage), - "value_index": tensor.value_index, - "shape": str(tensor.op.output(tensor.value_index).shape), - "data_type": tensor.op.output(tensor.value_index).dtype, - } - return tensor_dict - - def encode_tensors(obj_manager, stage): - tensors = [] - for i in range(stage.op.num_outputs): - tensor = stage.op.output(i) - tensors.append(encode_tensor(obj_manager, tensor, stage)) - tensors.sort(key=lambda tensor: tensor["value_index"]) - return tensors - - def encode_stage(obj_manager, stage, range_map): - """Extract and encode stage visualization data to a dictionary""" - stage_dict = { - "type": "Stage", - "name": stage.op.name, - "attaching_to": obj_manager.get_dom_path(stage.attach_ivar), - "compute": str(stage.op.body) if hasattr(stage.op, "body") else None, - "properties": { - "scope": stage.scope, - }, - "all_itervars": encode_itervars(stage, range_map), - "relations": encode_itervar_relations(obj_manager, stage), - "input_tensors": [ - 
obj_manager.get_dom_path(frozenset({tensor.op.name, tensor.value_index})) - for tensor in stage.op.input_tensors - ], - "output_tensors": encode_tensors(obj_manager, stage), - } - return stage_dict - - def encode_schedule(sch, need_range): - """Extract and encode data from a schedule for visualization to a nested dictionary. - It is useful for JSON to serialize schedule. - - Parameters - ---------- - sch : schedule - The schedule object to extract - - Returns - ------- - dict : dictionary - A nested dictionary - """ - assert isinstance( - sch, tvm.te.schedule.Schedule - ), "Input is not a tvm.te.schedule.Schedule object." - range_map = None - if need_range: - try: - range_map = tvm.te.schedule.InferBound(sch) - except tvm._ffi.base.TVMError as expt: - warnings.warn( - "Ranges are not available, because InferBound fails with the following error:\n" - + str(expt) - ) - - obj_manager = ObjectManager(sch) - stages = [] - for stage in sch.stages: - stages.append(encode_stage(obj_manager, stage, range_map)) - return { - "type": "Schedule", - "stages": stages, - } - - return json.dumps(sch, default=lambda s: encode_schedule(s, need_range)) - - -def viz_schedule_tree(sch, show_svg=False, dot_file_path="", output_dot_string=False): - """Top level API to render schedule tree - - Parameters - ---------- - sch : schedule - The schedule object to visualize - - show_svg : bool - Display graph as SVG, useful for Jupyter notebooks. - - dot_file_path : string - Dot file to save the graph. - - output_dot_string : bool - Return dot file content or an empty string. - - Returns - ------- - dot_string : string - Dot file content or an empty string according to output_dot_string - - Examples - -------- - The following code writes a schedule tree to a dot file. - - .. code-block:: python - tedd.viz_schedule_tree(s, dot_file_path = '/tmp/example.dot') - - Use the following code to render a SVG graph in a Jupyter notebook. - - .. 
code-block:: python - tedd.viz_schedule_tree(s, show_svg = True) - """ - - def create_schedule_tree_graph(name=""): - return create_graph(name=name, rankdir="BT") - - def root_dot(g): - g.node("ROOT", "ROOT", shape="oval", margin="0") - - def stage_node_dot(g, stage): - node_label = stage_node_label(stage) - g.node(stage["id"], node_label, shape="none", margin="0") - - def stage_node_label(stage): - """Return a html format label for the given stage.""" - label = ( - '<" - ) - - for leafiv in leaf_itervars(stage): - iv_type = leafiv["itervar_type"] - var_attr_label = "" - if "thread" in leafiv["properties"] and leafiv["properties"]["thread"] is not None: - var_attr_label = ( - var_attr_label - + '
(' - + str(leafiv["properties"]["thread"]) - + ")" - ) - if "intrin" in leafiv["properties"] and leafiv["properties"]["intrin"] is not None: - var_attr_label = ( - var_attr_label - + "
" - + linebrk( - "(tensor_intrin:" + str(leafiv["properties"]["intrin"]) + ")", - TVMDD_TABLE_BODY_WIDTH, - ) - ) - var_label, color = get_itervar_label_color(leafiv, iv_type) - label += itervar_label(leafiv, leafiv["index"], color, var_label + var_attr_label) - if stage["compute"] is not None: - label += ( - '" - ) - label += "
' + stage_label(stage) + "
' - + linebrk(str(stage["compute"]), TVMDD_TABLE_BODY_WIDTH) - + "
>" - return label - - def compute_at_dot(g, stage): - """If the given stage attaches to another stage, create an edge from it - stage to its attach point; otherwise, create an edge to the ROOT. - """ - src = stage["id"] - dst = ( - dom_path_to_string([stage["attaching_to"][0]], "Stage") - + ":" - + dom_path_to_string(stage["attaching_to"], "IterVar") - if stage["attaching_to"] is not None - else "ROOT" - ) - color = ( - PALETTE[stage["attaching_to"][1] + 1] - if stage["attaching_to"] is not None and stage["attaching_to"][1] < PALETTE_SIZE - 1 - else PALETTE[0] - ) - g.edge(src, dst, color=color) - - graph = create_schedule_tree_graph("Schedule Tree") - s = extract_dom_for_viz(sch) - legend_dot(graph) - for stage in s["stages"]: - stage_node_dot(graph, stage) - for stage in s["stages"]: - compute_at_dot(graph, stage) - root_dot(graph) - return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string) - - -def viz_itervar_relationship_graph(sch, show_svg=False, dot_file_path="", output_dot_string=False): - """Top level API to render IterVar relationship graph - - Parameters - ---------- - sch : schedule - The schedule object to visualize - - show_svg : bool - Display graph as SVG, useful for Jupyter notebooks. - - dot_file_path : string - Dot file to save the graph. - - output_dot_string : bool - Return dot file content or an empty string. - - Examples - -------- - The following code writes Ian tervar relationship graph to a dot file. - - .. code-block:: python - tedd.viz_def viz_itervar_relationship_graph(sch, - (s, dot_file_path = '/tmp/example.dot') - - Use the following code to render a SVG graph in a Jupyter notebook. - - .. 
code-block:: python - tedd.viz_def viz_itervar_relationship_graph(sch, - (s, show_svg = True) - """ - - def create_itervar_relation_graph(name=""): - return create_graph(name=name, rankdir="TB") - - def itervar_node_dot(g, itervar, iv_type, index): - label = itervar_node_label(itervar, iv_type, index) - g.node(itervar["id"], label, shape="none", margin="0") - - def itervar_node_label(itervar, iv_type, index): - label = ( - '<' - + itervar_label( - itervar, - index, - get_itervar_label_color(itervar, iv_type)[1], - get_itervar_label_color(itervar, iv_type)[0], - ) - + "
>" - ) - return label - - def itervar_relation_node_dot(g, node_id, node_label, input_ports, output_ports): - label = itervar_relation_node_label(node_label, input_ports, output_ports) - g.node(node_id, label, shape="none", margin="0") - - def itervar_relation_node_label(node_label, input_ports, output_ports): - """Return a html format label for an itervar relationship node - including node_label and input/output ports. - """ - label = '<' + "" - max_port_num = max(len(input_ports), len(output_ports)) - for i in range(max_port_num): - if i < len(input_ports): - input_port = input_ports[i] - label += '" - else: - label += '' - label += "" - label += ( - '" - ) - label += "" - for i in range(max_port_num): - if i < len(output_ports): - output_port = output_ports[i] - label += ( - '" - ) - else: - label += '' - label += "" - label += "
' + input_port + "
' - + node_label - + "
' + output_port + "
>" - return label - - def itervar_relation_dot(g, node, node_id): - """Create an itervar relationship node.""" - node_type = node["type"] - if node_type == "Split_Relation": - node_type = "Split" - itervar_relation_node_dot(g, node_id, node_type, ["Input"], ["Outer", "Inner"]) - parent = dom_path_to_string(node["parent"], "IterVar") - outer = dom_path_to_string(node["outer"], "IterVar") - inner = dom_path_to_string(node["inner"], "IterVar") - g.edge(parent + ":itervar", node_id + ":Input") - g.edge(node_id + ":Outer", outer + ":itervar") - g.edge(node_id + ":Inner", inner + ":itervar") - elif node_type == "Fuse_Relation": - node_type = "Fuse" - itervar_relation_node_dot(g, node_id, node_type, ["Outer", "Inner"], ["Fused"]) - fused = dom_path_to_string(node["fused"], "IterVar") - outer = dom_path_to_string(node["outer"], "IterVar") - inner = dom_path_to_string(node["inner"], "IterVar") - g.edge(outer + ":itervar", node_id + ":Outer") - g.edge(inner + ":itervar", node_id + ":Inner") - g.edge(node_id + ":Fused", fused + ":itervar") - elif node_type == "Singleton_Relation": - node_type = "Singleton" - itervar_relation_node_dot(g, node_id, node_type, [], ["Iter"]) - itervar = dom_path_to_string(node["inner"], "IterVar") - g.edge(node_id + ":Iter", itervar + ":itervar") - else: - assert False, "Unknown IterVarRelationNode: " + node_type - - def stage_node_dot(g, stage): - """Create a stage node.""" - with g.subgraph(name="cluster_" + stage["id"]) as subgraph: - subgraph.attr(label=stage["name"]) - if stage["all_itervars"]: - for itervar in stage["all_itervars"]: - iv_type = itervar["itervar_type"] - itervar_node_dot(subgraph, itervar, iv_type, itervar["index"]) - for rel in stage["relations"]: - node_id = rel["id"] - itervar_relation_dot(subgraph, rel, node_id) - else: - subgraph.node(stage["name"] + "_placeholder", style="invis") - - graph = create_itervar_relation_graph("IterVar Relationship Graph") - s = extract_dom_for_viz(sch) - legend_dot(graph) - for stage in 
s["stages"]: - stage_node_dot(graph, stage) - - return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string) - - -def viz_dataflow_graph(sch, show_svg=False, dot_file_path="", output_dot_string=False): - """Top level API to render dataflow graph - - Parameters - ---------- - sch : schedule - The schedule object to visualize - - show_svg : bool - Display graph as SVG, useful for Jupyter notebooks. - - dot_file_path : string - Dot file to save the graph. - - output_dot_string : bool - Return dot file content or an empty string. - - Examples - -------- - The following code writes a dataflow graph to a dot file. - - .. code-block:: python - tedd.viz_dataflow_graph(s, dot_file_path = '/tmp/example.dot') - - Use the following code to render a SVG graph in a Jupyter notebook. - - .. code-block:: python - tedd.viz_dataflow_graph(s, show_svg = True)""" - - def create_dataflow_graph(name=""): - return create_graph(name=name, rankdir="LR") - - def tensor_node_dot(g, tensor): - """Create a tensor node.""" - label = tensor_node_label(tensor) - g.node(tensor["id"], label, shape="oval", margin="0") - - def tensor_node_label(tensor): - """Return a html format label for the given tensor.""" - label = str(tensor["shape"]) + "\n" + str(tensor["data_type"]) - return label - - def stage_node_dot(g, stage): - """Create a stage node.""" - label = stage_node_label(stage) - g.node(stage["id"], label, shape="none", margin="0") - - def stage_node_label(stage): - """Return a html format label for the given stage.""" - rows = max(1, max(len(stage["output_tensors"]), len(stage["input_tensors"]))) - label = '<' - for i in range(rows): - label += "" - if i < len(stage["input_tensors"]): - port_id = get_port_id(True, i) - label += ( - '" - ) - else: - label += '' - if i == 0: - label += ( - '" - ) - if i < len(stage["output_tensors"]): - port_id = get_port_id(False, i) - label += ( - '" - ) - else: - label += '' - label += "" - label += "
' + str(i) + "' - + stage_label(stage) - + "' + str(i) + "
>" - return label - - def dfg_dot(g, sch): - """Create edges among stages.""" - stages = sch["stages"] - for stage in stages: - for i in range(len(stage["input_tensors"])): - src = dom_path_to_string(stage["input_tensors"][i], "Tensor") - dst = stage["id"] + ":" + get_port_id(True, i) - g.edge(src, dst) - for i in range(len(stage["output_tensors"])): - src = stage["id"] + ":" + get_port_id(False, i) - dst = stage["output_tensors"][i]["id"] - g.edge(src, dst) - - graph = create_dataflow_graph("Dataflow Graph") - s = extract_dom_for_viz(sch, need_range=False) - for stage in s["stages"]: - stage_node_dot(graph, stage) - for tensor in stage["output_tensors"]: - tensor_node_dot(graph, tensor) - - dfg_dot(graph, s) - - return dump_graph(graph.source, show_svg, dot_file_path, output_dot_string) diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index fb325de1d3ab..94006111ffa2 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -17,106 +17,36 @@ # pylint: disable=invalid-name """The build utils in python.""" -from typing import Union, Optional, List, Mapping +from typing import Union, Optional -import warnings import tvm.tir -from tvm import te -from tvm.runtime import Module from tvm.runtime import ndarray -from tvm.ir import container from tvm.tir import PrimFunc from tvm.ir.module import IRModule -from tvm.te import tensor from tvm.target import Target -from tvm.tir.buffer import Buffer -from tvm.tir.expr import Var from tvm.driver import _ffi_api as _driver_ffi from . import _ffi_api as ffi -def get_binds(args, compact=False, binds=None): - """Internal function to get binds and arg_list given arguments. - Parameters - ---------- - args : list of Buffer or Tensor or Var - The argument lists to the function. - compact : bool - If the statement has already bound to a compact buffer. 
- binds : dict of :any:`Tensor` to :any:`Buffer`, optional - Dictionary that maps the Tensor to Buffer which specified the data layout - requirement of the function. By default, a new compact buffer is created - for each tensor in the argument. - Returns - ------- - binds: dict - The bind specification - arg_list: list - The list of symbolic buffers of arguments. - """ - binds, arg_list = ffi.get_binds(args, compact, binds) - return binds, arg_list - - -def schedule_to_module( - sch: te.Schedule, - args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None, - name: str = "main", - binds: Optional[Mapping[tensor.Tensor, Buffer]] = None, -) -> IRModule: - """According to the given schedule, form a function. - - This is a low-level function intended for testing purposes, and - does not apply any optimization passes. In general, `tvm.lower` - and `tvm.build` should be used instead. - - Parameters - ---------- - sch : tvm.te.schedule.Schedule - The given scheduler to form the raw body - args : list of Buffer or Tensor or Var - The argument lists to the function. - name : str - The name of result function, default name is "main" - binds : dict of :any:`Tensor` to :any:`Buffer`, optional - The binds information - Returns - ------- - The body formed according to the given schedule - """ - return ffi.schedule_to_module(sch, args, name, binds) - - def lower( - inp: Union[te.Schedule, PrimFunc, IRModule], - args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None, + inp: Union[PrimFunc, IRModule], name: str = "main", - binds: Optional[Mapping[tensor.Tensor, Buffer]] = None, simple_mode: bool = False, ) -> IRModule: """Lowering step before build into target. Parameters ---------- - inp : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule] + inp : Union[tvm.tir.PrimFunc, IRModule] The TE schedule or TensorIR PrimFunc/IRModule to be built - args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, tir.Var]]] - The argument lists to the function for TE schedule. 
- - It should be None if we want to lower TensorIR. name : str The name of the result function. - binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]] - Dictionary that maps the Tensor to Buffer which specified the data layout - requirement of the function. By default, a new compact buffer is created - for each tensor in the argument. - simple_mode : bool Whether only output simple and compact statement, this will skip LoopPartition, api wrapper generation and Unrolling. @@ -130,139 +60,65 @@ def lower( return ffi.lower_module(inp, simple_mode) if isinstance(inp, PrimFunc): return ffi.lower_primfunc(inp, name, simple_mode) - if isinstance(inp, te.Schedule): - return ffi.lower_schedule(inp, args, name, binds, simple_mode) raise ValueError( f"Expected input to be an IRModule, PrimFunc or te.Schedule, but got {type(inp)}" ) def build( - inputs: Union[te.Schedule, PrimFunc, IRModule, Mapping[str, IRModule]], - args: Optional[List[Union[Buffer, tensor.Tensor, Var]]] = None, + inputs: Union[PrimFunc, IRModule], target: Optional[Union[str, Target]] = None, - target_host: Optional[Union[str, Target]] = None, - runtime: Optional[ - "tvm.relay.backend.Runtime" - ] = None, # Type is annotated this way to avoid cyclic dependency - name: Optional[str] = "default_function", - binds: Optional[Mapping[tensor.Tensor, Buffer]] = None, + name: str = "main", ): """Build a function with arguments as signature. Code will be generated for devices coupled with target information. Parameters ---------- - inputs : Union[tvm.te.schedule.Schedule, tvm.tir.PrimFunc, IRModule, Mapping[str, IRModule]] + inputs : Union[tvm.tir.PrimFunc, IRModule] The input to be built - args : Optional[List[Union[tvm.tir.Buffer, tensor.Tensor, tir.Var]]] - The argument lists to the function. - target : Optional[Union[str, Target]] The target and option of the compilation. - target_host : Optional[Union[str, Target]] - Host compilation target, if target is device.
- When TVM compiles device specific program such as CUDA, - we also need host(CPU) side code to interact with the driver - setup the dimensions and parameters correctly. - target_host is used to specify the host side codegen target. - By default, llvm is used if it is enabled, - otherwise a stackvm interpreter is used. - - runtime : Optional[Runtime] - Runtime to generate artifacts for - - name : Optional[str] + name : str The name of result function. - binds : Optional[Mapping[tensor.Tensor, tvm.tir.Buffer]] - Dictionary that maps the binding of symbolic buffer to Tensor. - By default, a new buffer is created for each tensor in the argument. - Returns ------- ret : tvm.module A module that combines both host and device code. - Examples - ________ - There are two typical example uses of this function depending on the type - of the argument `inputs`: - 1. it is an IRModule. - - .. code-block:: python - - n = 2 - A = te.placeholder((n,), name='A') - B = te.placeholder((n,), name='B') - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s = tvm.te.create_schedule(C.op) - m = tvm.lower(s, [A, B, C], name="test_add") - rt_mod = tvm.build(m, target="llvm") - - 2. it is a dict of compilation target to IRModule. - - .. code-block:: python - - n = 2 - A = te.placeholder((n,), name='A') - B = te.placeholder((n,), name='B') - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s1 = tvm.te.create_schedule(C.op) - with tvm.target.cuda() as cuda_tgt: - s2 = topi.cuda.schedule_injective(cuda_tgt, [C]) - m1 = tvm.lower(s1, [A, B, C], name="test_add1") - m2 = tvm.lower(s2, [A, B, C], name="test_add2") - rt_mod = tvm.build({"llvm": m1, "cuda": m2}) - Note ---- See the note on :any:`tvm.target` on target string format. 
""" - if isinstance(inputs, te.Schedule): - if args is None: - raise ValueError("args must be given for build from schedule") - input_mod = lower(inputs, args, name=name, binds=binds) - elif isinstance(inputs, (list, tuple, container.Array)): - merged_mod = tvm.IRModule({}) - for x in inputs: - merged_mod.update(lower(x)) - input_mod = merged_mod - elif isinstance(inputs, PrimFunc): + if isinstance(inputs, PrimFunc): input_mod = lower(inputs, name=name) elif isinstance(inputs, tvm.IRModule): assert ( len(inputs.get_global_vars()) > 0 ), "Expected a non-empty IRModule, but the IRModule contained no functions." input_mod = lower(inputs) - elif not isinstance(inputs, (dict, container.Map)): - raise ValueError( - f"Inputs must be te.Schedule, IRModule, PrimFunc, " - f"or dict of target to IRModule, " - f"but got {type(inputs)}." - ) - - if not isinstance(inputs, (dict, container.Map)): - target = Target.current() if target is None else target - if target is None and isinstance(input_mod, tvm.IRModule): - target_mod = {} - for gvar, func in input_mod.functions.items(): - tgt = func.attrs["target"] if "target" in func.attrs else "llvm" - if tgt not in target_mod: - target_mod[tgt] = {} - target_mod[tgt][gvar] = func - - target_input_mod = {} - for tgt in target_mod.keys(): - tir_mod = tvm.IRModule(target_mod[tgt]) - tir_mod = tir_mod.with_attrs(input_mod.attrs) - target_input_mod[tgt] = tir_mod - else: - target_input_mod = {target: input_mod} else: - target_input_mod = {tgt: lower(mod) for tgt, mod in inputs.items()} + raise ValueError("Inputs must be IRModule or PrimFunc") + + target = Target.current() if target is None else target + if target is None and isinstance(input_mod, tvm.IRModule): + target_mod = {} + for gvar, func in input_mod.functions.items(): + tgt = func.attrs["target"] if "target" in func.attrs else "llvm" + if tgt not in target_mod: + target_mod[tgt] = {} + target_mod[tgt][gvar] = func + + target_input_mod = {} + for tgt in target_mod.keys(): + tir_mod 
= tvm.IRModule(target_mod[tgt]) + tir_mod = tir_mod.with_attrs(input_mod.attrs) + target_input_mod[tgt] = tir_mod + else: + target_input_mod = {target: input_mod} # Because modules can be created from a variety of sources, we annotate them # with the relevant attributes here to ensure they propagate @@ -271,18 +127,10 @@ def build( if not isinstance(tgt, (str, Target)): raise ValueError("The key of inputs must be str or " "Target when inputs is dict.") if not isinstance(mod, tvm.IRModule): - raise ValueError("inputs must be Schedule, IRModule, " "or dict of str to IRModule.") - annotated_mods[tgt] = mod.with_attr("runtime", runtime) + raise ValueError("inputs must be IRModule, " "or dict of str to IRModule.") + annotated_mods[tgt] = mod - # TODO(mbs): Both CompilationConfig and TIRToRuntime implement the same host target - # defaulting logic, but there's currently no way to get back the decided host. - if target_host is not None: - warnings.warn( - "target_host parameter is going to be deprecated. " - "Please pass in tvm.target.Target(target, host=target_host) instead." 
- ) - - annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host) + annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods) if not target_host: for tar, mod in annotated_mods.items(): device_type = ndarray.device(tar.kind.name, 0).device_type @@ -296,41 +144,4 @@ def build( rt_mod_host = _driver_ffi.tir_to_runtime(annotated_mods, target_host) - annotated_mods, target_host = Target.canon_target_map_and_host(annotated_mods, target_host) - - if not isinstance(target_host, Target): - target_host = Target(target_host) - - if str(runtime) == "crt" and runtime["system-lib"]: - if target_host.kind.name == "c": - create_csource_crt_metadata_module = tvm._ffi.get_global_func( - "runtime.CreateCSourceCrtMetadataModule" - ) - to_return = create_csource_crt_metadata_module([rt_mod_host], target_host, runtime) - elif target_host.kind.name == "llvm": - create_llvm_crt_metadata_module = tvm._ffi.get_global_func( - "runtime.CreateLLVMCrtMetadataModule" - ) - to_return = create_llvm_crt_metadata_module([rt_mod_host], target_host, runtime) - else: - to_return = rt_mod_host - - return OperatorModule.from_module(to_return, ir_module_by_target=annotated_mods, name=name) - - -class OperatorModule(Module): - """Wraps the Module returned by tvm.build() and captures additional outputs of that function.""" - - @classmethod - def from_module(cls, mod, **kwargs): - # NOTE(areusch): It is generally unsafe to continue using `mod` from this point forward. - # If an exception occurs in cls.__init__, handle will be deleted. For this reason, - # set mod.handle to None. 
- handle = mod.handle - mod.handle = None - return cls(handle, **kwargs) - - def __init__(self, handle, ir_module_by_target=None, name=None): - super(OperatorModule, self).__init__(handle) - self.ir_module_by_target = ir_module_by_target - self.name = name + return rt_mod_host diff --git a/python/tvm/exec/measure_peak.py b/python/tvm/exec/measure_peak.py deleted file mode 100644 index 178e60089245..000000000000 --- a/python/tvm/exec/measure_peak.py +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""measure bandwidth and compute peak - -e.g. 
-python3 -m tvm.exec.measure_peak --target cuda --rpc-host 127.0.0.1 --rpc-port 9090 -python3 -m tvm.exec.measure_peak --target opencl --target-host "llvm -mtriple=aarch64-linux-gnu" \ - --rpc-host $TVM_OPENCL_DEVICE_HOST --rpc-port 9090 -""" - -import argparse -import logging - -from tvm.target import Target -from ..contrib.peak import measure_peak_all - - -def main(): - """Main function""" - parser = argparse.ArgumentParser() - parser.add_argument("--target", type=str, default="llvm", help="The build target") - parser.add_argument( - "--target-host", type=str, default=None, help="The host code compilation target" - ) - parser.add_argument( - "--rpc-host", type=str, default="127.0.0.1", help="the hostname of the server" - ) - parser.add_argument("--rpc-port", type=int, default=9090, help="The port of the RPC") - - args = parser.parse_args() - logging.basicConfig(level=logging.INFO) - - args.target, args.target_host = Target.canon_target_and_host(args.target, args.target_host) - measure_peak_all(args.target, args.target_host, args.rpc_host, args.rpc_port) - - -if __name__ == "__main__": - main() diff --git a/python/tvm/relax/frontend/torch/fx_translator.py b/python/tvm/relax/frontend/torch/fx_translator.py index ce1f284be6bc..d57d24bf2f77 100644 --- a/python/tvm/relax/frontend/torch/fx_translator.py +++ b/python/tvm/relax/frontend/torch/fx_translator.py @@ -99,7 +99,6 @@ def convert(node: fx.Node) -> relax.Var: ########## Neural Network ########## def _adaptive_avg_pool2d_module(self, node: fx.Node) -> relax.Var: - module = self.named_modules[node.target] x = self.env[node.args[0]] output_size = module.output_size diff --git a/python/tvm/relax/vm_build.py b/python/tvm/relax/vm_build.py index cfa4143b66c3..ac4d9698a072 100644 --- a/python/tvm/relax/vm_build.py +++ b/python/tvm/relax/vm_build.py @@ -179,10 +179,12 @@ def _vmcodegen( raise ValueError(f"Unknown exec_mode {exec_mode}") -def _autodetect_system_lib_req( - target: Optional[tvm.target.Target] = None, 
system_lib: Optional[bool] = None +def _auto_attach_system_lib_prefix( + tir_mod: tvm.IRModule, + target: Optional[tvm.target.Target] = None, + system_lib: Optional[bool] = None, ): - """Automatically detect system lib requirement""" + """Automatically detect system lib req and attach prefix attr""" if target is not None: host = target if target.host is None else target.host if system_lib is None: @@ -191,9 +193,9 @@ def _autodetect_system_lib_req( system_lib = True if system_lib: - # use packed-func to avoid relay dep. - return tvm.get_global_func("relay.backend.CreateRuntime")("cpp", {"system-lib": system_lib}) - return None + if tir_mod.get_attr("system_lib_prefix") is None: + return tir_mod.with_attr("system_lib_prefix", "") + return tir_mod def _vmlink( @@ -246,11 +248,8 @@ def _vmlink( relax_ext_libs = [] tir_ext_libs = [] if tir_mod is not None and len(tir_mod.get_global_vars()) > 0: - lib = tvm.build( - tir_mod, - target=target, - runtime=_autodetect_system_lib_req(target, system_lib), - ) + tir_mod = _auto_attach_system_lib_prefix(tir_mod, target, system_lib) + lib = tvm.build(tir_mod, target=target) for ext_mod in ext_libs: if ext_mod.is_device_module: tir_ext_libs.append(ext_mod) diff --git a/python/tvm/te/__init__.py b/python/tvm/te/__init__.py index 0907ea2ebf85..e7b394ebf76c 100644 --- a/python/tvm/te/__init__.py +++ b/python/tvm/te/__init__.py @@ -28,21 +28,11 @@ from tvm.tir import comm_reducer, min, max, sum from tvm.tir import add, subtract, multiply -from .schedule import ( - Schedule, - Stage, - create_schedule, - SpecializedCondition, - AXIS_SEPARATOR, -) from .tensor import TensorSlice, Tensor -from .tensor_intrin import decl_tensor_intrin from .tag import tag_scope from .operation import placeholder, compute, scan, extern, var, size_var, const -from .operation import thread_axis, reduce_axis +from .operation import thread_axis, reduce_axis, AXIS_SEPARATOR from .operation import create_prim_func from .operation import extern_primfunc -from 
.tensor import PlaceholderOp, ComputeOp, TensorComputeOp, ScanOp, ExternOp, HybridOp -from .autodiff import gradient -from . import hybrid +from .tensor import PlaceholderOp, ComputeOp, TensorComputeOp, ScanOp, ExternOp diff --git a/python/tvm/te/autodiff.py b/python/tvm/te/autodiff.py deleted file mode 100644 index f8650839948d..000000000000 --- a/python/tvm/te/autodiff.py +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Automatic differentiation of tensor expressions.""" -from . import _ffi_api - - -def gradient(output, inputs, head=None): - """Perform reverse-mode automatic differentiation. - - Parameters - ---------- - output : Tensor - The tensor to differentiate. - - inputs : List[Tensor] - The list of input tensors to be differentiated wrt. - - head : Tensor - The adjoint of the output, in other words, some tensor, by which the Jacobians - will be multiplied. Its shape must be of the form `prefix + output.shape`. - If `None` is passed, the identity tensor of shape `output.shape + output.shape` - will be used. - - Returns - ------- - tensors: List[Tensor] - The result gradient, in the same order as the inputs - - Example - ------- - .. 
code-block:: python - - x = tvm.placeholder((32, 3, 28, 28), name='x') - w1 = tvm.placeholder((10, 3, 3, 3), name='w1') - w2 = tvm.placeholder((10, 10, 3, 3), name='w2') - z1 = topi.nn.conv2d(x, w1, 1, 1, 1) - z2 = topi.nn.conv2d(z1, w2, 1, 1, 1) - y = topi.sum(z2) - - # produce gradients - [dw1, dw2] = tvm.gradient(y, [w1, w2]) - - # produce Jacobians - [jw1, jw2] = tvm.gradient(z2, [w1, w2]) - - # produce gradients, the head adjoint for z2 is provided manually - [dw1, dw2] = tvm.gradient(z2, [w1, w2], topi.full_like(z2, 1.0)) - - """ - if not isinstance(inputs, list): - inputs = [inputs] - return _ffi_api.Gradient(output, inputs, head) diff --git a/python/tvm/te/hybrid/__init__.py b/python/tvm/te/hybrid/__init__.py deleted file mode 100644 index cd320c6b209c..000000000000 --- a/python/tvm/te/hybrid/__init__.py +++ /dev/null @@ -1,101 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Hybrid Programming APIs of TVM Python Package. - -This package maps a subset of python to HalideIR so that: -1. Users can write some preliminary versions of the computation patterns -have not been supported yet and verify it across the real execution and -python semantic emulation. -2. 
So far, it is a text format dedicated to HalideIR Phase 0. Refer tvm.lower -for more details. A larger ambition of this module is to support all levels of -HalideIR. -""" - -# TODO(@were): Make this module more complete. -# 1. Support HalideIR dumping to Hybrid Script -# 2. Support multi-level HalideIR -import inspect -import tvm._ffi -import tvm.te.schedule -from tvm._ffi.base import decorate - -from .module import HybridModule -from .parser import source_to_op -from .utils import _pruned_source - - -def script(pyfunc): - """Decorate a python function as hybrid script. - - The hybrid function support emulation mode and parsing to - the internal language IR. - - Returns - ------- - hybrid_func : function - A decorated hybrid script function. - """ - # pylint: disable=import-outside-toplevel, missing-docstring - def wrapped_func(func, *args, **kwargs): - from .utils import _is_tvm_arg_types - - if _is_tvm_arg_types(args): - src = _pruned_source(func) - closure_vars = inspect.getclosurevars(func).nonlocals - closure_vars.update(inspect.getclosurevars(func).globals) - return source_to_op(src, args, func.__globals__, closure_vars) - - from .runtime import _enter_hybrid_runtime, _restore_runtime - - intersect = _enter_hybrid_runtime(func) - value = func(*args, **kwargs) - _restore_runtime(func, intersect) - return value - - return decorate(pyfunc, wrapped_func) - - -def build(sch, inputs, outputs, name="hybrid_func"): - """Dump the current schedule to hybrid module - - Parameters - ---------- - sch: tvm.te.Schedule - The schedule to be dumped - - inputs: An array of Tensors or Vars - The inputs of the function body - - outputs: An array of Tensors - The outputs of the function body - - Returns - ------- - module: HybridModule - The built results is wrapped in a HybridModule. - The usage of HybridModule is roughly the same as normal TVM-built modules. 
- """ - sch = sch.normalize() - bounds = tvm.te.schedule.InferBound(sch) - stmt = tvm.te.schedule.ScheduleOps(sch, bounds) - - src = _Dump(stmt, inputs, outputs, name) - - return HybridModule(src, name) - - -tvm._ffi._init_api("tvm.hybrid", __name__) diff --git a/python/tvm/te/hybrid/calls.py b/python/tvm/te/hybrid/calls.py deleted file mode 100644 index 948a0d7665ff..000000000000 --- a/python/tvm/te/hybrid/calls.py +++ /dev/null @@ -1,183 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Intrinsics of TVM-Python Hybrid Script for Python compilation time -semantic support.""" - -from tvm.runtime import const, convert -import tvm.te -from tvm.ir.container import Array -from tvm.target import Target -from tvm.tir import expr as _expr -from tvm.tir import call_intrin -from tvm.tir.stmt import ForKind - -from .utils import _internal_assert - -# pylint: disable=redefined-builtin,invalid-name - -LOOP_INTRIN = { - "range": ForKind.SERIAL, - "unroll": ForKind.UNROLLED, - "parallel": ForKind.PARALLEL, - "vectorize": ForKind.VECTORIZED, - "const_range": (ForKind.UNROLLED,), -} - - -def _range(annotation, args): - """Handling TVM loop types""" - n = args.__len__() - if n == 1: - low, ext = const(0, dtype="int32"), args[0] - else: - _internal_assert(n == 2, "A loop intrinsic should only have 1 or 2 arguments!") - low, ext = args[0], args[1] - if not tvm.tir.analysis.expr_deep_equal(low, const(0, dtype="int32")): - ext = ext - low - kind = LOOP_INTRIN[annotation] - iter_var = None - return iter_var, low, ext, kind - - -range = unroll = vectorize = parallel = const_range = _range # pylint: disable=invalid-name - - -def bind(func_id, args): - """Handling TVM thread binding""" - _internal_assert(func_id == "bind", "This function cannot be directly invoked!") - _internal_assert(args.__len__() == 2, "A loop bind should only have 2 arguments!") - _internal_assert(isinstance(args[0], str), "A loop bind's first argument should be a string!") - low, ext = const(0, "int32"), args[1] - iter_var = tvm.te.thread_axis((low, ext), args[0]) - kind = None - return iter_var, low, ext, kind - - -def _math_intrin(func_id, args): - # pylint: disable=import-outside-toplevel - from tvm.tir import op - - return getattr(op, func_id)(*args) - - -sqrt = ( - log -) = exp = tanh = sigmoid = power = popcount = round = _math_intrin # pylint: disable=invalid-name - - -def _min_max(func_id, args): - _internal_assert(args.__len__() == 2, "Max/Min function should have 2 elements") - return 
getattr(_expr, func_id.title())(args[0], args[1]) - - -min = max = _min_max # pylint: disable=invalid-name - - -def _allocate_tensor(func_id, args): - """Handling TVM tensor allocation. - You may refer hybrid.intrin.allocate for more details.""" - n = args.__len__() - _internal_assert( - isinstance(convert(args[0]), Array), "allocate's first argument should be a tuple of shape!" - ) - shape = args[0] - for i in shape: - _internal_assert(isinstance(i, (_expr.PrimExpr, int)), "The shape should be an expression") - if n > 1: - _internal_assert(isinstance(args[1], str), "The data type should be an str") - _internal_assert( - args[1].startswith("int") or args[1].startswith("float"), - "The data type should be either int or float!", - ) - dtype = args[1] - else: - dtype = "float32" - if n > 2: - _internal_assert(isinstance(args[2], str), "The data scope should be an string") - _internal_assert(func_id != "output_tensor", "Output tensor cannot specify scope") - scope = args[2] - else: - scope = "global" if func_id != "output_tensor" else "output" - return (shape, dtype, scope) - - -output_tensor = allocate = _allocate_tensor # pylint: disable=invalid-name - - -def len(func_id, args): - """Iterpret the len function""" - _internal_assert(args.__len__() == 1, "Only 1 argument is expected!") - _internal_assert(func_id == "len", "This function cannot be directly invoked!") - try: - return convert(args[0].__len__()) - except: # pylint: disable=bare-except - _internal_assert(args[0].shape.__len__() == 1, "Only one-dimension array can get len") - return convert(args[0].shape[0]) - - -def _cast(func_id, args): - _internal_assert( - args.__len__() == 1, - f"Casting to {func_id} only supports a single argument", - ) - # The FFI can handle any conversion of `args[0]` into PrimExpr, if - # required. 
- return _expr.Cast(func_id, args[0]) - - -float16 = float32 = float64 = _cast # pylint: disable=invalid-name -int8 = int16 = int32 = int64 = _cast # pylint: disable=invalid-name -uint8 = uint16 = uint32 = uint64 = _cast # pylint: disable=invalid-name - - -def ceil_div(func_id, args): - _internal_assert(func_id == "ceil_div", "This function cannot be directly invoked!") - _internal_assert(args.__len__() == 2, "2 arguments expected for division!") - a, b = args - return (a + b - 1) // b - - -def likely(func_id, args): - _internal_assert(args.__len__() == 1, "Only one expression can be likely") - _internal_assert(func_id == "likely", "This function cannot be directly invoked!") - return call_intrin(args[0].dtype, "tir.likely", *args) - - -def max_num_threads(func_id, args): - """Set the maximum number of threads.""" - _internal_assert(func_id == "max_num_threads", "This function cannot be directly invoked!") - _internal_assert(args.__len__() <= 1, "At most one argument accepted!") - if args.__len__() == 0: - res = Target.current().max_num_threads - else: - _internal_assert(isinstance(args[0], _expr.IntImm), "In tvm bool should be uint") - res = Target.current(args[0].value).max_num_threads - return convert(res) - - -def inf(func_id, args): - """Infinity""" - _internal_assert(func_id == "inf", "This function cannot be directly invoked!") - _internal_assert(args.__len__() == 1, "One argument accepted!") - return tvm.tir.max_value(args[0]) - - -def ninf(func_id, args): - """Negative infinity""" - _internal_assert(func_id == "ninf", "This function cannot be directly invoked!") - _internal_assert(args.__len__() == 1, "One argument accepted!") - return tvm.tir.min_value(args[0]) diff --git a/python/tvm/te/hybrid/module.py b/python/tvm/te/hybrid/module.py deleted file mode 100644 index 729805b31b6b..000000000000 --- a/python/tvm/te/hybrid/module.py +++ /dev/null @@ -1,113 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license 
agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Methods and data structures to support dumping HalideIR to Hybrid Script. -This allows users to do quick hack to generated HalideIR and cast it back to -TVM modules. - -To enable this feature, you need to build with -DUSE_HYBRID_DUMP=ON. -""" - -import ast - -from tvm.contrib import utils -from .utils import _internal_assert -from .utils import _is_tvm_arg_types -from .parser import source_to_op - - -class HybridModule(object): - """The usage of Hybrid Module is very similar to conventional TVM module, - but conventional TVM module requires a function body which is already fully - lowered. This contradicts to the fact that Hybrid Module is originally a text - format for Phase 0 HalideIR. 
Thus, a totally separated module is defined.""" - - def __init__(self, src=None, name=None): - """The constructor of this a hybrid module - - Parameters - ---------- - src : str - The source code of this module - - name : str - The name of this module - """ - self.src_ = self.name = self.func_ = self.root_ = None - if src is not None: - temp = utils.tempdir() - dst = temp.relpath("script.py") - with open(dst, "w") as f: - f.write(f"import tvm\n@tvm.te.hybrid.script\n{src}") - - if name is not None: - self.name = name - self.load(dst) - - def __call__(self, *args): - if _is_tvm_arg_types(args): - return source_to_op(self.root_, args, globals(), {}) - return self.func_(*args) - - def get_source(self): - return self.src_ - - def save(self, path): - if not path.endswith(".py"): - path = path + ".py" - with open(path, "w") as f: - f.write(self.src_) - - def load(self, path): - """Load the module from a python file - - Parameters - ---------- - path : str - Path to the given python file - """ - with open(path, "r") as f: - self.src_ = f.read() - - src = self.src_ - - class FindFunc(ast.NodeVisitor): - """Find the function in module to be loaded module.""" - - # pylint: disable=invalid-name - def __init__(self): - self.name = None - self.root = None - - def visit_FunctionDef(self, node): - _internal_assert(self.name is None, "For now, only one function supported!") - self.name = node.name - _internal_assert(self.root is None, "For now, only one function supported!") - self.root = node - - root = ast.parse(src) - finder = FindFunc() - finder.visit(root) - _internal_assert(finder.name is not None and finder.root is not None, "No function found!") - if self.name is None: - self.name = finder.name - self.root_ = finder.root - - _, local_ = {}, {} - exec(self.src_, _, local_) # pylint: disable=exec-used - local_.pop("tvm") - assert len(local_) == 1 - self.func_ = list(local_.values())[0] diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py deleted file 
mode 100644 index bd5a060cd01c..000000000000 --- a/python/tvm/te/hybrid/parser.py +++ /dev/null @@ -1,658 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Hybrid Script Parser""" - -import ast -import operator -import logging -import sys -import numbers - -from enum import Enum -from tvm.ir import Array, Range -import tvm.runtime -import tvm.tir -import tvm.te -import tvm.te._ffi_api -import tvm.arith - -from tvm.tir import expr as _expr -from tvm.tir import stmt as _stmt -from tvm.te.tensor import Tensor, Operation -from tvm.tir import all as _all -from tvm.tir import any as _any - -from .utils import _internal_assert -from . import calls -from . 
import utils -from .preprocessor import determine_variable_usage - - -def concat_list_to_block(lst): - """Concatenate a list of Python IR nodes to HalideIR Block""" - if not lst: - return utils.make_nop() - n = len(lst) - if n == 1: - return lst[0] - return _stmt.SeqStmt(lst) - - -def visit_list_to_block(visit, lst): - """Visit and concatenate a list of Python IR nodes to HalideIR Block""" - lst = [visit(stmt) for stmt in lst if not utils.is_docstring(stmt)] - lst = [stmt for stmt in lst if not tvm.ir.structural_equal(stmt, utils.make_nop())] - if not lst: - return utils.make_nop() - return concat_list_to_block(lst) - - -class Symbol(Enum): - """Enumerates types in the symbol table""" - - Callable = 0 - Input = 1 - OutputBuffer = 2 - GlobalBuffer = 3 - LocalBuffer = 4 - SharedBuffer = 5 - ConstVar = 6 - BufferVar = 7 - LoopVar = 8 - ConstLoopVar = 9 - ThreadBind = 10 - - -def _floordiv(x, y): - if isinstance(x, _expr.ExprOp) or isinstance(y, _expr.ExprOp): - return tvm.tir.floordiv(x, y) - return operator.floordiv(x, y) - - -def _floormod(x, y): - if isinstance(x, _expr.ExprOp) or isinstance(y, _expr.ExprOp): - return tvm.tir.floormod(x, y) - return operator.mod(x, y) - - -class HybridParser(ast.NodeVisitor): - """Python AST visitor pass which finally lowers it to HalideIR""" - - _binop_maker = { - ast.Add: operator.add, - ast.Sub: operator.sub, - ast.Mult: operator.mul, - ast.Div: operator.div if sys.version_info[0] == 2 else operator.truediv, - ast.FloorDiv: _floordiv, - ast.Mod: _floormod, - ast.BitOr: operator.or_, - ast.BitAnd: operator.and_, - ast.BitXor: operator.xor, - ast.Gt: operator.gt, - ast.GtE: operator.ge, - ast.Lt: operator.lt, - ast.LtE: operator.le, - ast.Eq: operator.eq, - ast.NotEq: operator.ne, - ast.And: _all, - ast.Or: _any, - } - - _unaryop_maker = {ast.USub: operator.neg, ast.Invert: operator.invert, ast.Not: operator.not_} - - def __init__(self, args, usage, symbols, closure_vars, func_name=None): - """ - Parameters - ---------- - args: A 
list of tvm.te.placeholder or te.var - Provided by the user, the argument list of the function to be lowered. - - usage: A dict of variables used in last in this function - Provided by last lower pass, which collects this information - - symbols : list of str - The symbol list of the global context of the function. - - closure_vars: dict - A dict of external name reference captured by this function. - - Returns - ------- - func_name: str - The name of the function to be lowered; if not provided, - the compiler will use the name in the AST - """ - self.args = list(args) - self.usage = usage.copy() - - self.symbols = {} # Symbol table - for k, v in symbols.items(): - if callable(v): - self.add_symbol(k, Symbol.Callable, v) - - self.closure_vars = closure_vars - - self.binds = {} # Thread binds - self.device = 0 # Is it generating device - - self.func_name = func_name # The name of the function to be lowered - self.outputs = [] # Output tensors' name - self.side_effect = set() # Tensors with side effects - self.parsed_body = None # The parsed HalideIR body - self.analyzer = tvm.arith.Analyzer() - self.returned = False # If this function has a valid return - - def add_symbol(self, key, ty, val): # pylint: disable=invalid-name - """Add value to the symbol table context""" - if key in self.symbols.keys(): - old = str(self.symbols[key]) - new = str((ty, val)) - _internal_assert(False, f"Name conflict in symbol table! 
[{key}] {old} -> {new}") - - self.symbols[key] = ty, val - - if ty == Symbol.ThreadBind: - if val.var.name not in self.binds.keys(): - self.binds[val.var.name] = val - return - val_ = self.binds[val.var.name] - _internal_assert( - tvm.tir.analysis.expr_deep_equal(val_.dom.extent, val.dom.extent), - "Thread extents should be uniform!", - ) - self.symbols[key] = ty, val_ - - def wrap_up_realize(self, node, body): - """Wrap up all the variables which will no longer be used""" - to_pop = [] - for key, val in self.usage.items(): - _, level, _ = val - if key not in self.symbols: - # don't realize the symbols that are never visited - continue - if level != node: - continue - _internal_assert(key in self.symbols.keys(), f"Unknown symbol {key}!") - - ty, entry = self.symbols[key] # pylint: disable=invalid-name - if ty in [Symbol.Input, Symbol.OutputBuffer]: - continue - if "Buffer" in ty.name: - _buf = entry - _scope = "global" if ty is Symbol.BufferVar else ty.name[:-6].lower() - to_pop.append(key) - else: - continue - - if _scope == "global": - body = self.wrap_up_binds(body) - - _domain = [Range.from_min_extent(0, i) for i in _buf.shape] - _dtype = _buf.dtype - _true = tvm.runtime.convert(True) - body = tvm.tir.ProducerRealize(_buf, _domain, _true, body, tvm.runtime.convert(_scope)) - - for elem in to_pop: - self.symbols.pop(elem) - - return body - - def wrap_up_binds(self, body): - for _, iter_var in self.binds.items(): - ext = iter_var.dom.extent - body = tvm.tir.AttrStmt(iter_var, "thread_extent", ext, body) - self.binds = {} - return body - - # pylint: disable=invalid-name, missing-docstring - def visit_Module(self, node): - _internal_assert( - len(node.body) == 1, "Only one-function source code will be fed to this parser!" 
- ) - return self.visit(node.body[0]) - - def visit_FunctionDef(self, node): - _internal_assert( - len(node.args.args) == len(self.args), - "The number of arguments passed to the \ - function should be the same as it is defined!", - ) - if self.func_name is None: - self.func_name = node.name - for idx, arg in enumerate(node.args.args): - _attr = "id" if sys.version_info[0] < 3 else "arg" # To make py2 and 3 compatible - self.add_symbol(getattr(arg, _attr), Symbol.Input, self.args[idx]) - res = visit_list_to_block(self.visit, node.body) - res = self.wrap_up_realize(node, res) - return self.wrap_up_binds(res) - - def visit_Expr(self, node): - return self.visit(node.value) - - def visit_Name(self, node): - name = node.id - if sys.version_info[0] == 2 and name in ["True", "False"]: - return tvm.runtime.convert(ast.literal_eval(name)) - - if name in self.closure_vars: - return tvm.runtime.convert(self.closure_vars[name]) - - ty, entry = self.symbols[name] - _internal_assert(name in self.symbols, f"Unknown symbol {name}!") - if ty in [Symbol.LoopVar, Symbol.Input, Symbol.ConstLoopVar]: - return entry - if ty is Symbol.ThreadBind: - return entry.var - if ty is Symbol.ConstVar: - return entry if isinstance(node.ctx, ast.Load) else None - if ty is Symbol.BufferVar: - if isinstance(node.ctx, ast.Load): - return tvm.tir.ProducerLoad(entry, [tvm.runtime.const(0, "int32")]) - return entry, [tvm.runtime.const(0, "int32")] - # Do I need any assertion here? 
- return entry - - def visit_Num(self, node): - if isinstance(node.n, numbers.Integral): - dtype = "int32" - elif isinstance(node.n, float): - dtype = "float32" - else: - _internal_assert( - isinstance(node.n, bool), "The data type should be one of (int, float, bool)" - ) - dtype = "bool" - return tvm.runtime.const(node.n, dtype) - - def visit_NameConstant(self, node): - return tvm.tir.const(node.value) - - def visit_AugAssign(self, node): - buf = self.visit(node.target) - rhs = self.visit(node.value) - if isinstance(buf, tuple): - _internal_assert(len(buf) == 2, "LHS is supposed to be (buf, args)!") - buf, args = buf - else: - args = [tvm.runtime.const(0, "int32")] - _internal_assert(isinstance(buf, Tensor), "LHS is supposed to be Tensor!") - - read = tvm.tir.ProducerLoad(buf, args) - value = HybridParser._binop_maker[type(node.op)](read, rhs) - - return tvm.tir.ProducerStore(buf, value, args) - - def visit_Assign(self, node): - rhs = self.visit(node.value) - if isinstance(rhs, Operation): - rmap = {} - _internal_assert( - len(node.targets) == rhs.num_outputs, "Unable to detuple the outs to targets" - ) - for i in range(rhs.num_outputs): - _internal_assert( - isinstance(node.targets[i], ast.Name), - "You should bind a pure name to the tensors", - ) - self.add_symbol(node.targets[i].id, Symbol.GlobalBuffer, rhs.output(i)) - rmap[rhs.outputs[i].op] = rhs.output(i) - return utils.replace_io(rhs.body, rmap) - - _internal_assert(len(node.targets) == 1, "So far only one-valued assignment is supported!") - lhs = node.targets[0] - if isinstance(rhs, _expr.PrimExpr): - rhs = self.analyzer.simplify(rhs) - if isinstance(lhs, ast.Name): - # TODO: support defined intermediate buffer later - lhs_ = lhs - lhs = lhs.id - if lhs in self.symbols.keys(): - ty, _ = self.symbols[lhs] - _internal_assert(ty != Symbol.LoopVar, "Loop variable cannot be overwritten!") - decl, _, rw = self.usage[lhs] - if decl == lhs_: - _internal_assert( - lhs not in self.symbols.keys(), - "This value 
should not be defined before this point!", - ) - if isinstance(rhs, tuple): - shape, dtype, scope = rhs - ph = tvm.te.placeholder(shape, dtype=dtype, name=lhs) - self.add_symbol(lhs, getattr(Symbol, scope.title() + "Buffer"), ph) - if scope == "output": - self.outputs.append(lhs) - return utils.make_nop() - if isinstance(rhs, utils.halide_imm_types) and ast.Store not in rw: - self.add_symbol(lhs, Symbol.ConstVar, rhs) - else: - _internal_assert( - self.device == 0, - "Single variable not supported in devices' side!\n" - + "If you are using GPU, please allocate a 'local' spad " - + "outside the bind body", - ) - ph = tvm.te.placeholder((1,), dtype=rhs.dtype, name=lhs) - self.add_symbol(lhs, Symbol.BufferVar, ph) - lhs = self.visit(lhs_) - if lhs is not None: - buf, args = lhs - return tvm.tir.ProducerStore(buf, rhs, args) - return utils.make_nop() - - lhs, args = self.visit(lhs) - _internal_assert( - isinstance(lhs, Tensor), "An array access's LHS is expected to be a expr.Call!" - ) - res = tvm.tir.ProducerStore(lhs, rhs, args) - return res - - def visit_Index(self, node): - if isinstance(node.value, ast.Tuple): - return self.visit(node.value) - return [self.visit(node.value)] - - def visit_Attribute(self, node): - buf = self.visit(node.value) - return getattr(buf, node.attr) - - def visit_Subscript(self, node): - args = self.visit(node.slice) - if sys.version_info >= (3, 9): - if not isinstance(node.slice, ast.Tuple): - args = [args] - - arr = self.visit(node.value) - if isinstance(arr, (Array, list, tuple)): - for i in args: - if isinstance(i, numbers.Integral): - arr = arr[i] - else: - _internal_assert( - isinstance(i, (_expr.IntImm,)), "All indices are supposed to be constants" - ) - arr = arr[i.value] - return arr - if isinstance(node.ctx, ast.Load): - return tvm.tir.ProducerLoad(arr, args) - return arr, args - - def visit_With(self, node): - if sys.version_info[0] < 3: - context = node.context_expr - option = node.optional_vars - else: - 
_internal_assert(len(node.items) == 1, "Only one with element is supported so far!") - context = node.items[0].context_expr - option = node.items[0].optional_vars - _internal_assert(isinstance(context, ast.Call), "The object must be a Python func call!") - _internal_assert(isinstance(option, ast.Name), "The object after 'as' must be an id!") - self.annotation[option.id] = context.func.id - return visit_list_to_block(self.visit, node.body) - - def visit_If(self, node): - cond = self.analyzer.simplify(self.visit(node.test)) - - # Return no IfThenElse if proven - if isinstance(cond, _expr.IntImm): - if cond.value: - return visit_list_to_block(self.visit, node.body) - if node.orelse: - return visit_list_to_block(self.visit, node.orelse) - return utils.make_nop() - - if_body = visit_list_to_block(self.visit, node.body) - - if node.orelse: - else_body = visit_list_to_block(self.visit, node.orelse) - else: - else_body = None - return tvm.tir.IfThenElse(cond, if_body, else_body) - - def visit_IfExp(self, node): - cond = self.visit(node.test) - if_body = self.visit(node.body) - else_body = self.visit(node.orelse) - return tvm.tir.Select(cond, if_body, else_body) - - def visit_Compare(self, node): - _internal_assert(len(node.ops) == len(node.comparators), "#compare ops != #comparators") - ops = [self.visit(node.left)] - ops += [self.visit(i) for i in node.comparators] - res = [] - for i in range(len(node.ops)): - lhs = ops[i] - rhs = ops[i + 1] - res.append(HybridParser._binop_maker[type(node.ops[i])](lhs, rhs)) - return _all(*res) - - def visit_BoolOp(self, node): - n = len(node.values) - if n == 1: - _internal_assert(isinstance(node.op, ast.Not), "Unary is supposed to be not!") - return operator.not_(self.visit(node.values[0])) - _internal_assert(isinstance(node.op, (ast.And, ast.Or)), "Binary is supposed to be and/or!") - values = [self.visit(i) for i in node.values] - return HybridParser._binop_maker[type(node.op)](*values) - - def visit_UnaryOp(self, node): - operand = 
self.visit(node.operand) - return HybridParser._unaryop_maker[type(node.op)](operand) - - def visit_BinOp(self, node): - lhs = self.visit(node.left) - rhs = self.visit(node.right) - return HybridParser._binop_maker[type(node.op)](lhs, rhs) - - def visit_Call(self, node): - # Yet, no function pointer supported - _internal_assert( - isinstance(node.func, ast.Name), "Only id-function function call is supported so far!" - ) - - func_id = node.func.id - args = [self.visit(i) for i in node.args] - # Intrinsics' - if hasattr(calls, func_id): - return getattr(calls, func_id)(func_id, args) - # Contexts' - _internal_assert( - func_id in self.symbols.keys(), - f"The function called ({func_id}) is not in the context either!", - ) - ty, entry = self.symbols[func_id] - _internal_assert(ty is Symbol.Callable, "Are you sure what you call is a function?!") - outs = entry(*args) - op = outs.op if isinstance(outs, Tensor) else outs[0].op - return op - - def visit_For(self, node): - iter_var, low, ext, kind = self.visit(node.iter) - _internal_assert( - isinstance(node.target, ast.Name), "The loop iterator should be a variable!" - ) - - _name = node.target.id - - if isinstance(kind, tuple): - low = self.analyzer.simplify(low) - ext = self.analyzer.simplify(ext) - _internal_assert( - isinstance(low, _expr.ConstExpr) and isinstance(ext, _expr.ConstExpr), - "Const range should start from a const " + "and iterate const times", - ) - - low, ext = low.value, ext.value - if ext > 114514: - logging.log( - logging.CRITICAL, "[Warning] Are you sure to unroll a large loop in Python?" 
- ) - - bodies = [] - for i in range(low, low + ext): - self.add_symbol(_name, Symbol.ConstLoopVar, i) - body = visit_list_to_block(self.visit, node.body) - body = self.wrap_up_realize(node, body) - bodies.append(body) - self.symbols.pop(_name) - return concat_list_to_block(bodies) - - if iter_var is None: - _internal_assert(kind is not None, "The loop iterating function parse error!") - if isinstance(ext, _expr.PrimExpr): - dtype = ext.dtype - elif isinstance(ext, int): - dtype = "int32" - else: - raise NotImplementedError(f"Unsupported type of ext: {type(ext)}") - offset = iter_var = tvm.te.var(_name, dtype=dtype) - if not tvm.tir.analysis.expr_deep_equal(low, tvm.runtime.const(0, "int32")): - offset = iter_var + low - self.add_symbol(_name, Symbol.LoopVar, offset) - _body = visit_list_to_block(self.visit, node.body) - else: - _internal_assert(kind is None, "The loop bind function parse error!") - self.add_symbol(_name, Symbol.ThreadBind, iter_var) - self.device += 1 - _body = visit_list_to_block(self.visit, node.body) - self.device -= 1 - - _body = self.wrap_up_realize(node, _body) - - if kind is None: - res = _body - else: - _internal_assert( - not isinstance(kind, tuple), "Micro expansion should be handled before!" - ) - res = tvm.tir.For(iter_var, tvm.runtime.const(0, "int32"), ext, kind, _body) - - self.symbols.pop(_name) - return res - - def visit_Return(self, node): - _internal_assert( - all(ty != Symbol.LoopVar for ty, _ in self.symbols.values()), - "Return should not be in a loop body!", - ) - ids = [] - if isinstance(node.value, ast.Name): - ids = [node.value.id] - else: - _internal_assert( - isinstance(node.value, ast.Tuple), - "You should return either a single tensor or a tuple", - ) - _internal_assert( - all(isinstance(i, ast.Name) for i in node.value.elts), "What do you return?" 
- ) - ids = [i.id for i in node.value.elts] - _internal_assert(len(set(ids)) == len(ids), "Duplicated tensors in the return tuples") - if len(ids) < len(self.outputs): - logging.log(logging.CRITICAL, "[Warning] Not all the output buffers returned!") - self.outputs = [self.symbols[i][1] for i in ids] - self.returned = True - return utils.make_nop() - - def visit_Tuple(self, node): - return tuple(self.visit(i) for i in node.elts) - - def visit_Str(self, node): - return node.s - - def visit_Assert(self, node): - test = self.visit(node.test) - mesg = tvm.runtime.convert(self.visit(node.msg)) - return tvm.tir.AssertStmt(test, mesg, utils.make_nop()) - - -def parse_python(src, args, symbols, closure_vars): - """The helper function of calling the AST visitor - - Parameters - ---------- - src : ast.node or str - If an ast.node, then directly lower it. - If a str, then parse it to ast and lower it. - - args : list of Tensors or Vars - The argument lists to the function. - It is NOT encouraged to write a function without arguments. - It is NOT encouraged to write a function with side effect. - - symbols : list of str - The symbol list of the global context of the function. - - closure_vars: dict - A dict of external name reference captured by this function. - - Returns - ------- - root : Stmt - The result Halide IR and the parser class instance. - """ - root = ast.parse(src) if isinstance(src, str) else src - _internal_assert(root, ast.AST) - var_usage = determine_variable_usage(root, args, symbols, closure_vars) - parser = HybridParser(args, var_usage, symbols, closure_vars) - parser.parsed_body = parser.visit(root) - _internal_assert(parser.returned, "No valid return found in the function body!") - return parser - - -def source_to_op(src, args, symbols, closure_vars): - """Another level of wrapper - - Parameters - ---------- - src : ast.node or str - If an ast.node, then directly lower it. - If a str, then parse it to ast and lower it. 
- - args : list of Tensors or Vars - The argument lists to the function. - It is NOT encouraged to write a function without arguments. - It is NOT encouraged to write a function with side effect. - - symbols : list of str - The symbol list of the global context of the function. - - closure_vars: dict - A dict of external name reference captured by this function. - - Returns - ------- - res : list of output tensors - The result of output tensors of the formed OpNode. - """ - parser = parse_python(src, args, symbols, closure_vars) - - input_tensors = [] - - def get_input_tensors(arg): - if isinstance(arg, Tensor): - input_tensors.append(arg) - elif isinstance(arg, Array): - for i in arg: - get_input_tensors(i) - - for i in args: - get_input_tensors(i) - op = tvm.te._ffi_api.HybridOp( - parser.func_name, "HybridOp", None, input_tensors, parser.outputs, parser.parsed_body - ) - res = [op.output(i) for i in range(len(parser.outputs))] - return res[0] if len(res) == 1 else res diff --git a/python/tvm/te/hybrid/preprocessor.py b/python/tvm/te/hybrid/preprocessor.py deleted file mode 100644 index 6af584060e9b..000000000000 --- a/python/tvm/te/hybrid/preprocessor.py +++ /dev/null @@ -1,120 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -"""Determines the declaration, r/w status, and last use of each variable""" - -import ast -import sys -from .runtime import HYBRID_GLOBALS -from .utils import _internal_assert - - -class PyVariableUsage(ast.NodeVisitor): - """The vistor class to determine the declaration, r/w status, and last use of each variable""" - - # pylint: disable=invalid-name - # pylint: disable=missing-docstring - def __init__(self, args, symbols, closure_vars): - self.status = {} - self.scope_level = [] - self._args = {} - self.args = args - self.aug_assign_ = False - self.symbols = symbols - self.closure_vars = closure_vars - - def visit_FunctionDef(self, node): - self.scope_level.append(node) - _internal_assert( - len(node.args.args) == len(self.args), - "#arguments passed should be the same as #arguments defined", - ) - for idx, arg in enumerate(node.args.args): - _attr = "id" if sys.version_info[0] < 3 else "arg" # To make py2 and 3 compatible - self._args[getattr(arg, _attr)] = self.args[idx] - for i in node.body: - self.visit(i) - - def visit_For(self, node): - _internal_assert(isinstance(node.target, ast.Name), "For's iterator should be an id") - self.visit(node.iter) - self.scope_level.append(node) - for i in node.body: - self.visit(i) - self.scope_level.pop() - - def visit_Call(self, node): - # No function pointer supported so far - _internal_assert(isinstance(node.func, ast.Name), "Function call should be an id") - func_id = node.func.id - _internal_assert( - func_id - in list(HYBRID_GLOBALS.keys()) - + ["range", "max", "min", "len"] - + list(self.symbols.keys()), - "Function call id " + func_id + " not in intrinsics' list", - ) - for elem in node.args: - self.visit(elem) - - def visit_AugAssign(self, node): - self.aug_assign_ = True - self.generic_visit(node) - self.aug_assign_ = False - - def visit_Name(self, node): - # If it is True or False, we do not worry about it! 
- if sys.version_info[0] == 2 and node.id in ["True", "False"]: - return - # If it is from the argument list or loop variable, we do not worry about it! - if node.id in self._args.keys(): - return - fors = [loop.target.id for loop in self.scope_level if isinstance(loop, ast.For)] - if node.id in fors: - return - # The loop variable cannot be overwritten when iteration - _internal_assert( - not isinstance(node.ctx, ast.Store) or node.id not in fors, - "Iter var cannot be overwritten", - ) - - if node.id not in self.status.keys(): - # It is a captured value in closure - if node.id in self.closure_vars: - try: - ast.literal_eval(str(self.closure_vars[node.id])) - except ValueError: - raise ValueError("Only support capturing constant values in closure") - return - - _internal_assert(isinstance(node.ctx, ast.Store), f"Undeclared variable {node.id}") - if self.aug_assign_: - raise ValueError('"First store" cannot be an AugAssign') - self.status[node.id] = (node, self.scope_level[-1], set()) - else: - decl, loop, usage = self.status[node.id] - usage.add(type(node.ctx)) - _internal_assert( - loop in self.scope_level, f"{node.id} is used out of the scope it is defined!" - ) - self.status[node.id] = (decl, loop, usage) - - -def determine_variable_usage(root, args, symbols, closure_vars): - """The helper function for calling the dedicated visitor.""" - visitor = PyVariableUsage(args, symbols, closure_vars) - visitor.visit(root) - return visitor.status diff --git a/python/tvm/te/hybrid/runtime.py b/python/tvm/te/hybrid/runtime.py deleted file mode 100644 index 615bd7e43a7d..000000000000 --- a/python/tvm/te/hybrid/runtime.py +++ /dev/null @@ -1,175 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Intrinsics of TVM-Python Hybrid Script for Python emulation runtime""" - -import numpy -from tvm.target import Target - - -class bind(object): # pylint: disable=invalid-name - """GPU bind software emulataion runtime.""" - - def __init__(self, _, ext): - self.ext = ext - - def __iter__(self): - i = 0 - while i < self.ext: - yield i - i += 1 - - -def allocate(shape, dtype="float32", scope="global"): # pylint: disable=unused-argument - """Allocate a buffer with given shape - - Parameters - ---------- - shape: Tuple - The shape of the tensor to be allocated - dtype: string - The data type of the tensor - scope: string - The storage scope of the tensor - - Returns - ------- - tensor: numpy.array - The tensor allocated - """ - return numpy.zeros(shape).astype(dtype) - - -def rsqrt(x): - """ - Computes reciprocal of square root of x element-wise - - Parameters - ---------- - x: Tensor - - Returns - ------- - res: Tensor - The result of reciprocal of square root of x - """ - return numpy.ones_like(x) / numpy.sqrt(x) - - -def popcount(x): - """ - Count ones in the binary representation of number x - - Parameters - ---------- - x: Integer - The number to be counted - - Returns - ------- - cnt: Integer - The number of ones in the binary representation of number x - """ - cnt = 0 - while x: - x -= x & -x - cnt += 1 - return cnt - - -def sigmoid(x): - """ - Sigmoid function of x, aka 1/(1+exp(-x)). 
- - Parameters - ---------- - x: a real number - - Returns - ------- - res: a real number - The result of sigmoid function - """ - return 1 / (1 + numpy.exp(-x)) - - -def max_num_threads(allow_none=True): - """Get max number of threads for GPU targets.""" - return Target.current(allow_none).max_num_threads - - -def inf(dtype): - return numpy.iinfo(dtype).max - - -def ninf(dtype): - return numpy.iinfo(dtype).min - - -HYBRID_GLOBALS = { - "unroll": range, - "vectorize": range, - "parallel": range, - "const_range": range, - "bind": bind, - "allocate": allocate, - "output_tensor": allocate, - "sqrt": numpy.sqrt, - "rsqrt": rsqrt, - "log": numpy.log, - "tanh": numpy.tanh, - "power": numpy.power, - "exp": numpy.exp, - "sigmoid": sigmoid, - "popcount": popcount, - "round": round, - "likely": lambda cond: cond, - "uint8": numpy.uint8, - "uint16": numpy.uint16, - "uint32": numpy.uint32, - "uint64": numpy.uint64, - "int8": numpy.int8, - "int16": numpy.int16, - "int32": numpy.int32, - "int64": numpy.int64, - "float16": numpy.float16, - "float32": numpy.float32, - "float64": numpy.float64, - "ceil_div": lambda a, b: (a + b - 1) // b, - "max_num_threads": max_num_threads, - "inf": inf, - "ninf": inf, -} - - -def _enter_hybrid_runtime(func): - """Put hybrid runtime variables into the global scope""" - _globals = func.__globals__ - intersect = [] - for elem in list(HYBRID_GLOBALS.keys()): - if elem in _globals.keys(): - intersect.append((elem, _globals[elem])) - _globals[elem] = HYBRID_GLOBALS[elem] - return intersect - - -def _restore_runtime(func, intersect): - """Rollback the modification caused by hybrid runtime""" - _globals = func.__globals__ - for elem in list(HYBRID_GLOBALS.keys()): - _globals.pop(elem) - for k, v in intersect: - _globals[k] = v diff --git a/python/tvm/te/hybrid/utils.py b/python/tvm/te/hybrid/utils.py deleted file mode 100644 index a515938fa524..000000000000 --- a/python/tvm/te/hybrid/utils.py +++ /dev/null @@ -1,103 +0,0 @@ -# Licensed to the Apache 
Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=inconsistent-return-statements -"""Internal utilities for parsing Python subset to TIR""" - -import ast -import inspect -import logging -import sys -import numpy - -import tvm.runtime -from tvm._ffi.base import numeric_types -from tvm.ir.container import Array - -from tvm.tir import expr as _expr -from tvm.tir import stmt as _stmt -from tvm.te.tensor import Tensor - - -# pylint: disable=invalid-name -np_arg_types = (numpy.ndarray, *numeric_types) -tvm_arg_types = (Tensor, Array, _expr.Var, _expr.ConstExpr, *numeric_types, list, tuple, str) -halide_imm_types = (_expr.IntImm, _expr.FloatImm, *numeric_types) - - -def _internal_assert(cond, err): - """Simplify the code segment like if not XXX then raise an error""" - if not cond: - raise ValueError(err) - - -# Useful constants. In avoid of runtime dependences, we use function calls to return them. 
-def make_nop(): - """Returns a 'no operation' node in HalideIR.""" - return _stmt.Evaluate(tvm.runtime.const(0, dtype="int32")) - - -def is_docstring(node): - """Checks if a Python AST node is a docstring""" - return isinstance(node, ast.Expr) and isinstance(node.value, ast.Str) - - -def _pruned_source(func): - """Prune source code's extra leading spaces""" - try: - lines = inspect.getsource(func).split("\n") - leading_space = len(lines[0]) - len(lines[0].lstrip(" ")) - lines = [line[leading_space:] for line in lines] - return "\n".join(lines) - except IOError as err: - if sys.version_info[0] == 2 and str(err) == "could not get source code": - logging.log( - logging.CRITICAL, - "This module is not fully operated under Python2... " "Please move to Python3!", - ) - raise err - - -def replace_io(body, rmap): - """Replacing tensors usage according to the dict given""" - # pylint: disable=import-outside-toplevel - from tvm.tir import stmt_functor - - def replace(op): - if isinstance(op, _stmt.ProducerStore) and op.producer.op in rmap.keys(): - buf = rmap[op.producer.op] - return _stmt.ProducerStore(buf, op.value, op.indices) - if isinstance(op, _expr.ProducerLoad) and op.producer.op in rmap.keys(): - buf = rmap[op.producer.op] - return _expr.ProducerLoad(buf, op.indices) - return None - - return stmt_functor.ir_transform(body, None, replace, ["tir.ProducerStore", "tir.ProducerLoad"]) - - -def _is_tvm_arg_types(args): - """Determine a list of element is either a list of tvm arguments of a list of numpy arguments. 
- If neither is true, raise a value error.""" - if all(isinstance(elem, tvm_arg_types) for elem in args): - return True - elif all(isinstance(elem, np_arg_types) for elem in args): - return False - else: - raise ValueError( - f"Expected arguments to be entirely TVM types, " - f"or entirely numpy types, " - f"but received {[type(elem) for elem in args]}" - ) diff --git a/python/tvm/te/operation.py b/python/tvm/te/operation.py index 63a3ecd57b1c..a9681c6df040 100644 --- a/python/tvm/te/operation.py +++ b/python/tvm/te/operation.py @@ -620,3 +620,6 @@ def tir_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: if not isinstance(ops, (list, tuple, Array)): ops = [ops] return _ffi_api.CreatePrimFunc(ops, index_dtype_override) + + +AXIS_SEPARATOR = tvm.tir.IndexMap.AXIS_SEPARATOR diff --git a/python/tvm/te/schedule.py b/python/tvm/te/schedule.py deleted file mode 100644 index 87a4eda728df..000000000000 --- a/python/tvm/te/schedule.py +++ /dev/null @@ -1,665 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=unused-import -"""The computation schedule api of TVM.""" -import collections -import inspect -from typing import Callable, List - -import tvm._ffi -from tvm._ffi.base import string_types -from tvm.ir import container as _container -from tvm.runtime import Object, convert -from tvm.tir import Buffer, IndexMap, IterVar, Var - -from . import _ffi_api -from . import tensor as _tensor - - -@tvm._ffi.register_object -class Split(Object): - """Split operation on axis.""" - - -@tvm._ffi.register_object -class Fuse(Object): - """Fuse operation on axis.""" - - -@tvm._ffi.register_object -class Singleton(Object): - """Singleton axis.""" - - -def create_schedule(ops): - """Create a schedule for list of ops - - Parameters - ---------- - ops : list of Operations - The source expression. - - Returns - ------- - sch : schedule.Schedule - The created schedule. - """ - if not isinstance(ops, (list, _container.Array)): - ops = [ops] - return _ffi_api.CreateSchedule(ops) - - -@tvm._ffi.register_object -class Schedule(Object): - """Schedule for all the stages.""" - - def __getitem__(self, k): - if isinstance(k, _tensor.Tensor): - k = k.op - if not isinstance(k, _tensor.Operation): - raise ValueError("Expect schedule key to be Tensor or Operation") - if k not in self.stage_map: - raise ValueError(f"Cannot find the operation {k} in schedule") - return self.stage_map[k] - - def normalize(self): - """Build a normalized schedule from the current schedule. - - Insert necessary rebase to make certain iter var to start from 0. - This is needed before bound inference and followup step. - - Returns - ------- - sch : Schedule - The normalized schedule. - """ - return _ffi_api.ScheduleNormalize(self) - - def create_group(self, outputs, inputs, include_inputs=False): - """Create stage group by giving output and input boundary. - - The operators between outputs and inputs are placed as member of group. - outputs are include in the group, while inputs are not included. 
- - Parameters - ---------- - outputs : list of Tensors - The outputs of the group. - - inputs : list of Tensors - The inputs of the group. - - include_inputs : boolean, optional - Whether include input operations in the group if they are used by outputs. - - Returns - ------- - group : Stage - A virtual stage represents the group, user can use compute_at to move - the attachment point of the group. - """ - if isinstance(outputs, _tensor.Tensor): - outputs = [outputs] - if isinstance(inputs, _tensor.Tensor): - inputs = [inputs] - return _ffi_api.ScheduleCreateGroup(self, outputs, inputs, include_inputs) - - def cache_read(self, tensor, scope, readers): - """Create a cache read of original tensor for readers. - - This will mutate the body of the readers. - A new cache stage will be created for the tensor. - Call this before doing any split/fuse schedule. - - Parameters - ---------- - tensor : Tensor - The tensor to be cached. - scope : str - The scope of cached - readers : list of Tensor or Operation - The readers to read the cache. - - Returns - ------- - cache : Tensor - The created cache tensor. - """ - if isinstance(readers, (_tensor.Tensor, _tensor.Operation)): - readers = [readers] - readers = [t.op if isinstance(t, _tensor.Tensor) else t for t in readers] - return _ffi_api.ScheduleCacheRead(self, tensor, scope, readers) - - def cache_write(self, tensor, scope): - """Create a cache write of original tensor, before storing into tensor. - - This will mutate the body of the tensor. - A new cache stage will created before feed into the tensor. - - This function can be used to support data layout transformation. - If there is a split/fuse/reorder on the data parallel axis of tensor - before cache_write is called. The intermediate cache stores - the data in the layout as the iteration order of leave axis. - The data will be transformed back to the original layout in the original tensor. 
- User can further call compute_inline to inline the original layout and keep - the data stored in the transformed layout. - - Parameters - ---------- - tensor : Tensor, list or tuple - The tensors to be feed to. All the tensors must be produced by one computeOp - scope : str - The scope of cached - - Returns - ------- - cache : Tensor - The created cache tensor. - """ - return _ffi_api.ScheduleCacheWrite(self, tensor, scope) - - def rfactor(self, tensor, axis, factor_axis=0): - """Factor a reduction axis in tensor's schedule to be an explicit axis. - - This will create a new stage that generated the new tensor with axis - as the first dimension. The tensor's body will be rewritten as a reduction - over the factored tensor. - - Parameters - ---------- - tensor : Tensor - The tensor to be factored. - axis : IterVar - The reduction axis in the schedule to be factored. - factor_axis : int - The position where the new axis is placed. - - Returns - ------- - tfactor : Tensor or Array of Tensor - The created factored tensor. - """ - factored = _ffi_api.ScheduleRFactor(self, tensor, axis, factor_axis) - return factored[0] if len(factored) == 1 else factored - - -@tvm._ffi.register_object -class Stage(Object): - """A Stage represents schedule for one operation.""" - - def split(self, parent, factor=None, nparts=None, disable_predication=False): - """Split the stage either by factor providing outer scope, or both - - Parameters - ---------- - parent : IterVar - The parent iter var. - - factor : Expr, optional - The splitting factor - - nparts : Expr, optional - The number of outer parts. - - disable_predication : bool, optional - If enabled, don't create a predicate for guarding the loop. This can - be useful when splitting with scalable factors that the schedule writer - knows are divisible by the loop bound. - - Warning: enabling this feature may result in incorrect code generation - if not used carefully. 
- - Returns - ------- - outer : IterVar - The outer variable of iteration. - - inner : IterVar - The inner variable of iteration. - """ - if nparts is not None: - if factor is not None: - raise ValueError("Do not need to provide both outer and nparts") - outer, inner = _ffi_api.StageSplitByNParts(self, parent, nparts, disable_predication) - else: - if factor is None: - raise ValueError("Either nparts or factor need to be provided") - outer, inner = _ffi_api.StageSplitByFactor(self, parent, factor, disable_predication) - return outer, inner - - def fuse(self, *args): - """Fuse multiple consecutive iteration variables into a single iteration variable. - - fused = fuse(...fuse(fuse(args[0], args[1]), args[2]),..., args[-1]) - The order is from outer to inner. - - Parameters - ---------- - args : list of IterVars - Itervars that proceeds each other - - Returns - ------- - fused : IterVar - The fused variable of iteration. - """ - fused = _ffi_api.StageFuse(self, args) - return fused - - def set_scope(self, scope): - """Set the thread scope of this stage - - Parameters - ---------- - scope : str - The thread scope of this stage - """ - return _ffi_api.StageSetScope(self, scope) - - def bind(self, ivar, thread_ivar): - """Bind ivar to thread index thread_ivar - - Parameters - ---------- - ivar : IterVar - The iteration to be binded to thread. - - thread_ivar : IterVar - The thread to be binded. - """ - _ffi_api.StageBind(self, ivar, thread_ivar) - - def env_threads(self, threads): - """Mark threads to be launched at the outer scope of composed op. - - Parameters - ---------- - threads : list of threads - The threads to be launched. - """ - if isinstance(threads, IterVar): - threads = [threads] - _ffi_api.StageEnvThreads(self, threads) - - def set_store_predicate(self, predicate): - """Set predicate under which store to the array can be performed. - - Use this when there are duplicated threads doing the same store and we only - need one of them to do the store. 
- - Parameters - ---------- - predicate : Expr - The guard condition fo store. - """ - _ffi_api.StageSetStorePredicate(self, predicate) - - def compute_at(self, parent, scope): - """Attach the stage at parent's scope - - Parameters - ---------- - parent : Stage - The parent stage - - scope : IterVar - The loop scope t be attached to. - """ - _ffi_api.StageComputeAt(self, parent, scope) - - def compute_inline(self): - """Mark stage as inline - - Parameters - ---------- - parent : Stage - The parent stage - """ - _ffi_api.StageComputeInline(self) - - def compute_root(self): - """Attach the stage at parent, and mark it as root - - Parameters - ---------- - parent : Stage - The parent stage - """ - _ffi_api.StageComputeRoot(self) - - def reorder(self, *args): - """reorder the arguments in the specified order. - - Parameters - ---------- - args : list of IterVar - The order to be ordered - """ - _ffi_api.StageReorder(self, args) - - def tile(self, x_parent, y_parent, x_factor, y_factor): - """Perform tiling on two dimensions - - The final loop order from outmost to inner most are - [x_outer, y_outer, x_inner, y_inner] - - Parameters - ---------- - x_parent : IterVar - The original x dimension - y_parent : IterVar - The original y dimension - x_factor : Expr - The stride factor on x axis - y_factor : Expr - The stride factor on y axis - - Returns - ------- - x_outer : IterVar - Outer axis of x dimension - y_outer : IterVar - Outer axis of y dimension - x_inner : IterVar - Inner axis of x dimension - p_y_inner : IterVar - Inner axis of y dimension - """ - x_outer, y_outer, x_inner, y_inner = _ffi_api.StageTile( - self, x_parent, y_parent, x_factor, y_factor - ) - return x_outer, y_outer, x_inner, y_inner - - def vectorize(self, var): - """Vectorize the iteration. 
- - Parameters - ---------- - var : IterVar - The iteration to be vectorize - """ - _ffi_api.StageVectorize(self, var) - - def tensorize(self, var, tensor_intrin): - """Tensorize the computation enclosed by var with tensor_intrin - - Parameters - ---------- - var : IterVar - The iteration boundary of tensorization. - - tensor_intrin : TensorIntrin - The tensor intrinsic used for computation. - """ - _ffi_api.StageTensorize(self, var, tensor_intrin) - - def unroll(self, var): - """Unroll the iteration. - - Parameters - ---------- - var : IterVar - The iteration to be unrolled. - """ - _ffi_api.StageUnroll(self, var) - - def parallel(self, var): - """Parallelize the iteration. - - Parameters - ---------- - var : IterVar - The iteration to be parallelized. - """ - _ffi_api.StageParallel(self, var) - - def pragma(self, var, pragma_type, pragma_value=None): - """Annotate the iteration with pragma - - This will translate to a pragma_scope surrounding - the corresponding loop generated. - Useful to support experimental features and extensions. - - Parameters - ---------- - var : IterVar - The iteration to be anotated - - pragma_type : str - The pragma string to be annotated - - pragma_value : Expr, optional - The pragma value to pass along the pragma - - Note - ---- - Most pragmas are advanced/experimental features - and may subject to change. List of supported pragmas: - - - **debug_skip_region** - - Force skip the region marked by the axis and turn it into no-op. - This is useful for debug purposes. - - - **parallel_launch_point** - - Specify to launch parallel threads outside the - specified iteration loop. By default the threads - launch at the point of parallel construct. - This pragma moves the launching point to even outer scope. - The threads are launched once and reused across multiple - parallel constructs as BSP style program. 
- - - **parallel_barrier_when_finish** - - Insert a synchronization barrier between working threads - after the specified loop iteration finishes. - - - **parallel_stride_pattern** - - Hint parallel loop to execute in strided pattern. - :code:`for (int i = task_id; i < end; i += num_task)` - - """ - if isinstance(pragma_value, string_types): - pragma_value = convert(pragma_value) - _ffi_api.StagePragma(self, var, pragma_type, pragma_value) - - def prefetch(self, tensor, var, offset): - """Prefetch the specified variable - - Parameters - ---------- - tensor : Tensor - The tensor to be prefetched - var : IterVar - The loop point at which the prefetching is applied - offset : Expr - The number of iterations to be prefetched before actual execution - """ - _ffi_api.StagePrefetch(self, tensor, var, offset) - - def storage_align(self, axis, factor, offset): - """Set alignment requirement for specific axis - - This ensures that stride[axis] == k * factor + offset for some k. - This is useful to set memory layout to for more friendly memory - access pattern. For example, we can set alignment to be - factor=2, offset=1 to avoid bank conflict for thread access on - higher dimension in GPU shared memory. - - Parameters - ---------- - axis : IterVar - The axis dimension to be aligned. - factor : int - The factor in alignment specification. - offset : int - The offset in the alignment specification. - """ - _ffi_api.StageStorageAlign(self, axis, factor, offset) - - def double_buffer(self): - """Compute the current stage via double buffering. - - This can only be applied to intermediate stage. - This will double the storage cost of the current stage. - Can be useful to hide load latency. - """ - _ffi_api.StageDoubleBuffer(self) - - def rolling_buffer(self): - """Compute the current stage via rolling buffering. - - This can only be applied to intermediate stage. - This will change the storage cost of the current stage. 
- """ - _ffi_api.StageRollingBuffer(self) - - def transform_layout(self, mapping_function: Callable[..., List[tvm.tir.PrimExpr]]): - """Defines the layout transformation for the current stage's tensor. - - The map from initial_indices to final_indices must be an - invertible affine transformation. This method may be called - more than once for a given tensor, in which case each - transformation is applied sequentially. - - If the stage is a ComputeOp, then the iteration order of the - compute stage is rewritten to be a row-major traversal of the - tensor, and the new loop iteration variables are returned. - For all other stages, the loop iteration order is unmodified, - and the return value is None. - - Parameters - ---------- - mapping_function : Callable[..., List[tvm.tir.PrimExpr]] - - A callable that accepts N arguments of type tvm.tir.Var, - and outputs a list of PrimExpr. The input arguments - represent the location of a value in the current stage's - tensor, using the pre-transformation layout. The return - value of the function gives the location of that value in - the current stage's tensor, using the post-transformation - layout. - - Returns - ------- - new_iter_vars : Optional[List[tvm.tir.IterVar]] - - If the stage is a ComputeOp, then the return will be the - updated loop iteration variables over the data array, in - the same order as the output values from the - `mapping_function`. - - Otherwise, the return value is None. - - Examples - -------- - .. code-block:: python - - # ``A`` is a tensor whose compute definition is in NHWC - # format, and should be transformed into NCHWc format. - - s[A].transform_layout( - lambda n,h,w,c: [n, c//4, h, w, c%4] - ) - - - .. code-block:: python - - # ``A`` is a tensor whose compute definition is in an - # arbitrary format, and should be transformed such that - # the last index is split, with the slower-changing index - # of the split placed at the slowest changing dimension. 
- - s[A].transform_layout( - lambda *indices, i: [i//4, *indices, i%4] - ) - - .. code-block:: python - - # ``B`` is a tensor defined by te.compute to be a copy of - # ``A`, and should be transformed such that ``B``'s layout - # is a transpose of ``A``'s layout. The loop iteration - # that computes ``B`` will correspond to ``B``'s memory - # layout. - - A = te.placeholder([n,m]) - B = te.compute(A.shape, lambda i,j: A[i,j]) - s = te.create_schedule(B.op) - - s[B].transform_layout(lambda i,j: [j,i]) - - """ - - ndim = len(self.op.output(0).shape) - index_map, axis_separators = IndexMap.from_func_with_separators( - mapping_function, ndim=ndim, index_dtype="int32" - ) - - new_iter_vars = _ffi_api.StageTransformLayout( - self, index_map.initial_indices, index_map.final_indices - ) - _ffi_api.StageSetAxisSeparators(self, axis_separators) - - return new_iter_vars or None - - -@tvm._ffi.register_object -class SpecializedCondition(Object): - - """Specialized condition to enable op specialization.""" - - def __init__(self, conditions): - """Create a specialized condition. - - .. note:: - Conditions are represented in conjunctive joint form (CNF). - Each condition should be a simple expression, e.g., n > 16, - m % 8 == 0, etc., where n, m are tvm.Var that represents a - dimension in the tensor shape. - - Parameters - ---------- - conditions : List of tvm.Expr - List of conditions in conjunctive joint form (CNF). 
- """ - if not isinstance(conditions, (list, _container.Array)): - conditions = [conditions] - self.__init_handle_by_constructor__(_ffi_api.CreateSpecializedCondition, conditions) - - @staticmethod - def current(): - """Returns the current specialized condition""" - return _ffi_api.GetCurrentSpecialization() - - def __enter__(self): - _ffi_api.EnterSpecializationScope(self) - return self - - def __exit__(self, ptype, value, trace): - _ffi_api.ExitSpecializationScope(self) - - -# Sentinel value used to indicate which groups of pre-flattening axes -# should be used to post-flattening axes. Moved from -# te.AXIS_SEPARATOR to tir.IndexMap.AXIS_SEPARATOR for general use, -# maintained here for backwards compatibility. -AXIS_SEPARATOR = IndexMap.AXIS_SEPARATOR - - -tvm._ffi._init_api("schedule", __name__) diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py index 930667242e29..53ab9d0b5b59 100644 --- a/python/tvm/te/tensor.py +++ b/python/tvm/te/tensor.py @@ -190,13 +190,3 @@ def scan_axis(self): @tvm._ffi.register_object class ExternOp(Operation): """External operation.""" - - -@tvm._ffi.register_object -class HybridOp(Operation): - """Hybrid operation.""" - - @property - def axis(self): - """Represent the IterVar axis, also defined when it is a HybridOp""" - return self.__getattr__("axis") diff --git a/python/tvm/te/tensor_intrin.py b/python/tvm/te/tensor_intrin.py deleted file mode 100644 index ff633af02d13..000000000000 --- a/python/tvm/te/tensor_intrin.py +++ /dev/null @@ -1,146 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Tensor intrinsics""" -import tvm._ffi -import tvm.tir - -from tvm.runtime import Object, convert -from tvm.ir import Range -from .tensor import PlaceholderOp - -from . import tensor as _tensor -from . import _ffi_api - - -def _get_region(tslice): - region = [] - for idx in tslice.indices: - if isinstance(idx, slice): - assert idx.step is None - region.append(Range(idx.start, idx.stop)) - else: - if isinstance(idx, tvm.tir.IterVar): - begin = idx.var - else: - begin = idx - region.append(Range.from_min_extent(begin, 1)) - return region - - -@tvm._ffi.register_object -class TensorIntrin(Object): - """Tensor intrinsic functions for certain computation. - - See Also - -------- - decl_tensor_intrin: Construct a TensorIntrin - """ - - def __call__(self, *args, **kwargs): - tensors = [x.tensor for x in args if isinstance(x, _tensor.TensorSlice)] - scalar_inputs = [x for x in args if not isinstance(x, _tensor.TensorSlice)] - regions = [_get_region(x) for x in args if isinstance(x, _tensor.TensorSlice)] - reduce_axis = [] - if "reduce_axis" in kwargs: - reduce_axis = kwargs["reduce_axis"] - if not isinstance(reduce_axis, (list, tuple)): - reduce_axis = [reduce_axis] - reduce_axis = convert(reduce_axis) - if scalar_inputs: - scalar_inputs = convert(scalar_inputs) - return _ffi_api.TensorIntrinCall(self, tensors, regions, reduce_axis, scalar_inputs) - - -def decl_tensor_intrin( - op, fcompute, name="tensor_intrin", binds=None, scalar_params=None, default_buffer_params=None -): - """Declare a tensor intrinsic function. 
- - Parameters - ---------- - op: Operation - The symbolic description of the intrinsic operation - - fcompute: lambda function of inputs, outputs-> stmt - Specifies the IR statement to do the computation. - See the following note for function signature of fcompute - - .. note:: - **Parameters** - - - **ins** (list of :any:`tvm.tir.Buffer`) - Placeholder for each inputs - - **outs** (list of :any:`tvm.tir.Buffer`) - Placeholder for each outputs - - **Returns** - - - **stmt** (:any:`tvm.tir.Stmt`, or tuple of three stmts) - - If a single stmt is returned, it represents the body - - If tuple of three stmts are returned they corresponds to body, - reduce_init, reduce_update - - name: str, optional - The name of the intrinsic. - - binds: dict of :any:`Tensor` to :any:`tvm.tir.Buffer`, optional - Dictionary that maps the Tensor to Buffer which specified the data layout - requirement of the function. By default, a new compact buffer is created - for each tensor in the argument. - - scalar_params: a list of variables used by op, whose values will be passed - as scalar_inputs when the tensor intrinsic is called. - - default_buffer_params: Optional[dict] - Dictionary of buffer arguments to be passed when constructing a buffer. - - Returns - ------- - intrin: TensorIntrin - A TensorIntrin that can be used in tensorize schedule. 
- """ - if not isinstance(op, _tensor.Operation): - raise TypeError("expect Operation") - inputs = op.input_tensors - binds = binds if binds else {} - tensors = list(inputs) - for i in range(op.num_outputs): - tensors.append(op.output(i)) - - binds_list = [] - for t in inputs: - if not isinstance(t.op, PlaceholderOp): - raise ValueError("Do not yet support composition op") - - default_buffer_params = {} if default_buffer_params is None else default_buffer_params - for t in tensors: - buf = ( - binds[t] - if t in binds - else tvm.tir.decl_buffer(t.shape, t.dtype, t.op.name, **default_buffer_params) - ) - binds_list.append(buf) - - if scalar_params: - body = fcompute(binds_list[: len(inputs)], binds_list[len(inputs) :], scalar_params) - else: - body = fcompute(binds_list[: len(inputs)], binds_list[len(inputs) :]) - scalar_params = [] - if isinstance(body, (tvm.tir.PrimExpr, tvm.tir.Stmt)): - body = [body] - body = [tvm.tir.Evaluate(x) if isinstance(x, tvm.tir.PrimExpr) else x for x in body] - if len(body) < 3: - body += [None] * (3 - len(body)) - return _ffi_api.TensorIntrin(name, op, inputs, binds_list, scalar_params, *body) diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 8df32c810543..b3123a20d3e9 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -327,8 +327,7 @@ def _compute_body(*us): A = tvm.te.compute([r.extent.value for v, r in vranges.items()], _compute_body) args = [tvm.nd.empty(A.shape, A.dtype)] - sch = tvm.te.create_schedule(A.op) - mod = tvm.build(sch, [A]) + mod = tvm.build(tvm.IRModule.from_expr(tvm.te.create_prim_func([A]))) mod(*args) return args[0].numpy() diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py index 1109cc3d66d6..72c2a40fedd2 100644 --- a/python/tvm/tir/buffer.py +++ b/python/tvm/tir/buffer.py @@ -304,29 +304,6 @@ def decl_buffer( buffer : tvm.tir.Buffer The created buffer - Example - ------- - Here's an example of how broadcast buffer can be used to define a 
symbolic broadcast operation, - - .. code-block:: python - - m0, m1, m2 = te.var("m0"), te.var("m1"), te.var("m2") - n0, n1, n2 = te.var("n0"), te.var("n1"), te.var("n2") - o0, o1, o2 = te.var("o0"), te.var("o1"), te.var("o2") - A = te.placeholder((m0, m1, m2), name='A') - B = te.placeholder((n0, n1, n2), name='B') - C = te.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name='C') - Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast") - Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast") - s = te.create_schedule(C.op) - fadd = tvm.build(s, [A, B, C], target='llvm', name='bcast_add', binds={A:Ab, B:Bb}) - dev = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(2, 1, 3)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - Note ---- Buffer data structure reflects the DLTensor structure in dlpack. diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py index 3588c04d8fa2..1de6941c9923 100644 --- a/python/tvm/topi/__init__.py +++ b/python/tvm/topi/__init__.py @@ -39,9 +39,7 @@ from .sort import * from .scatter import * from .scatter_elements import * -from .sparse_fill_empty_rows import * from .sparse_reshape import * -from .argwhere import * from .scan import * from .einsum import * from .unique import * @@ -49,9 +47,7 @@ from .signal import * from . import nn from . import utils -from . import vision from . import image -from . import random from . import gpu # error reporting diff --git a/python/tvm/topi/argwhere.py b/python/tvm/topi/argwhere.py deleted file mode 100644 index c2b658a4e92f..000000000000 --- a/python/tvm/topi/argwhere.py +++ /dev/null @@ -1,197 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks -"""Argwhere operator""" -import tvm -from tvm.te import hybrid - - -@hybrid.script -def hybrid_argwhere_1d(output_shape, condition): - """Find the indices of elements of a 1-D tensor that are non-zero. - - Parameters - ---------- - condition : tvm.te.Tensor - 1-D tensor with boolean values. - - Returns - ------- - out : tvm.te.Tensor - Indices of non-zero elements. - """ - a = output_tensor(output_shape, "int32") - a1 = condition.shape[0] - valid_index = 0 - for i1 in range(a1): - if condition[i1] != 0: - a[valid_index, 0] = i1 - valid_index += 1 - return a - - -@hybrid.script -def hybrid_argwhere_2d(output_shape, condition): - """Find the indices of elements of a 2-D tensor that are non-zero. - - Parameters - ---------- - condition : tvm.te.Tensor - 2-D tensor with boolean values. - - Returns - ------- - out : tvm.te.Tensor - Indices of non-zero elements. 
- """ - a = output_tensor(output_shape, "int32") - a1 = condition.shape[0] - a2 = condition.shape[1] - valid_index = 0 - for i1 in range(a1): - for i2 in range(a2): - if condition[i1, i2] != 0: - a[valid_index, 0] = i1 - a[valid_index, 1] = i2 - valid_index += 1 - return a - - -@hybrid.script -def hybrid_argwhere_3d(output_shape, condition): - """Find the indices of elements of a 3-D tensor that are non-zero. - - Parameters - ---------- - condition : tvm.te.Tensor - 3-D tensor with boolean values. - - Returns - ------- - out : tvm.te.Tensor - Indices of non-zero elements. - """ - a = output_tensor(output_shape, "int32") - a1 = condition.shape[0] - a2 = condition.shape[1] - a3 = condition.shape[2] - valid_index = 0 - for i1 in range(a1): - for i2 in range(a2): - for i3 in range(a3): - if condition[i1, i2, i3] != 0: - a[valid_index, 0] = i1 - a[valid_index, 1] = i2 - a[valid_index, 2] = i3 - valid_index += 1 - return a - - -@hybrid.script -def hybrid_argwhere_4d(output_shape, condition): - """Find the indices of elements of a 4-D tensor that are non-zero. - - Parameters - ---------- - condition : tvm.te.Tensor - 4-D tensor with boolean values. - - Returns - ------- - out : tvm.te.Tensor - Indices of non-zero elements. - """ - a = output_tensor(output_shape, "int32") - a1 = condition.shape[0] - a2 = condition.shape[1] - a3 = condition.shape[2] - a4 = condition.shape[3] - valid_index = 0 - for i1 in range(a1): - for i2 in range(a2): - for i3 in range(a3): - for i4 in range(a4): - if condition[i1, i2, i3, i4] != 0: - a[valid_index, 0] = i1 - a[valid_index, 1] = i2 - a[valid_index, 2] = i3 - a[valid_index, 3] = i4 - valid_index += 1 - return a - - -@hybrid.script -def hybrid_argwhere_5d(output_shape, condition): - """Find the indices of elements of a 5-D tensor that are non-zero. - - Parameters - ---------- - condition : tvm.te.Tensor - 5-D tensor with boolean values. - - Returns - ------- - out : tvm.te.Tensor - Indices of non-zero elements. 
- """ - a = output_tensor(output_shape, "int32") - a1 = condition.shape[0] - a2 = condition.shape[1] - a3 = condition.shape[2] - a4 = condition.shape[3] - a5 = condition.shape[4] - valid_index = 0 - for i1 in range(a1): - for i2 in range(a2): - for i3 in range(a3): - for i4 in range(a4): - for i5 in range(a5): - if condition[i1, i2, i3, i4, i5] != 0: - a[valid_index, 0] = i1 - a[valid_index, 1] = i2 - a[valid_index, 2] = i3 - a[valid_index, 3] = i4 - a[valid_index, 4] = i5 - valid_index += 1 - return a - - -@tvm.target.generic_func -def argwhere(output_shape, condition): - """Find the indices of elements of a tensor that are non-zero. - - Parameters - ---------- - condition : tvm.te.Tensor - Tensor with boolean values. - - Returns - ------- - out : tvm.te.Tensor - Indices of non-zero elements. - """ - if len(condition.shape) == 1: - return hybrid_argwhere_1d(output_shape.shape, condition) - if len(condition.shape) == 2: - return hybrid_argwhere_2d(output_shape.shape, condition) - if len(condition.shape) == 3: - return hybrid_argwhere_3d(output_shape.shape, condition) - if len(condition.shape) == 4: - return hybrid_argwhere_4d(output_shape.shape, condition) - if len(condition.shape) == 5: - return hybrid_argwhere_5d(output_shape.shape, condition) - raise ValueError("Does not support rank higher than 5 in argwhere") diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 5ee625577e38..e145add5f01b 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -615,68 +615,6 @@ def conv2d_NCHWc_int8( ) -def conv2d_gemm_weight_transform(kernel, tile_N, tile_K, use_scalable_vectors=False, use_sme=False): - """Weight transformation for winograd - - Parameters - ---------- - kernel: Tensor - The raw kernel tensor with layout "NHWC". - tile_N: int - Tile size across N axis of the weight transformation for ConvGemm. (N = OC) - tile_K: int - Tile size across K axis of the weight transformation for ConvGemm. 
(K = KW * KH * IC) - use_scalable_vectors : bool - determines if operations on scalable vectors are expected - use_sme : bool - determines if SME operations on scalable vectors are expected - - Returns - ------- - output : tvm.te.Tensor - 2-D with shape [CI*KH*KW,CO] - """ - KH, KW, IC, OC = get_const_tuple(kernel.shape) - K = KH * KW * IC - N = OC - - kernel_flat = te.compute( - (K, N), lambda x, y: kernel[(x // IC) // KW, (x // IC) % KW, x % IC, y], "weight_flatten" - ) - - pad_N, pad_K = tvm.topi.arm_cpu.arm_utils.get_conv2d_weights_padding(N, K, tile_N, tile_K) - - N_padded = N + pad_N - K_padded = K + pad_K - - if pad_K != 0 or pad_N != 0: - kernel_flat = pad( - kernel_flat, pad_before=(0, 0), pad_after=(pad_K, pad_N), name="weight_padding" - ) - - if use_sme and kernel.dtype == "float16": - return te.compute( - (N_padded, K_padded), lambda x, y: kernel_flat[y, x], name="weight_transpose" - ) - - if use_scalable_vectors or use_sme: - return kernel_flat - - if kernel.dtype in ["int8", "uint8"]: - B_inter_t = te.compute( - (N_padded // tile_N, K_padded // tile_K, tile_N, tile_K), - lambda x, y, z, w: kernel_flat[w + tile_K * y, z + tile_N * x], - name="weight_block_reshape", - ) - else: - B_inter_t = te.compute( - (N_padded // tile_N, K_padded // tile_K, tile_K, tile_N), - lambda x, y, z, w: kernel_flat[z + tile_K * y, w + tile_N * x], - name="weight_block_reshape", - ) - return B_inter_t - - def conv2d_winograd_weight_transform(kernel, tile_size): """Weight transformation for winograd @@ -712,29 +650,6 @@ def conv2d_winograd_weight_transform(kernel, tile_size): ) -def conv2d_winograd_nnpack_weight_transform(kernel, convolution_algorithm, out_dtype): - """Weight transformation for winograd - - Parameters - ---------- - kernel: Tensor - The raw kernel tensor with layout "NCHW". Only 3x3 kernel is supported for now. - convolution_algorithm: int - The convolution algorithm for Winograd NNPACK. 
- - Returns - ------- - output : tvm.te.Tensor - 4-D with shape [alpha, alpha, CO, CI] - """ - # pylint: disable=import-outside-toplevel - from tvm.contrib import nnpack - - return nnpack.convolution_inference_weight_transform( - kernel, algorithm=convolution_algorithm, dtype=out_dtype - ) - - def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtype=None): """Group convolution operator in NCHW layout. diff --git a/python/tvm/topi/random/__init__.py b/python/tvm/topi/random/__init__.py deleted file mode 100644 index ee8d1d6385b7..000000000000 --- a/python/tvm/topi/random/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=wildcard-import -"""Pseudorandom generator kernels and operators.""" -from __future__ import absolute_import - -from .kernel import * diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py deleted file mode 100644 index 464ea9634ab5..000000000000 --- a/python/tvm/topi/random/kernel.py +++ /dev/null @@ -1,657 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Pseudorandom number kernels.""" -import math -import numpy as np - -import tvm -import tvm.topi - -from ... import tir -from ...tir import ir_builder - - -# Threefry PRNG with splitting based on -# - J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, -# 2, 3," SC '11: Proceedings of 2011 International Conference for High Performance Computing, -# Networking, Storage and Analysis, Seattle, WA, 2011, pp. 1-12, doi: 10.1145/2063384.2063405. -# - Claessen, K. ; Palka, M. (2013) "Splittable Pseudorandom Number Generators using Cryptographic -# Hashing". Proceedings of Haskell Symposium 2013 pp. 47-58. MLA -# - Ferguson, Niels, et al. "The Skein hash function family." Submission to NIST (round 3) 7.7.5 -# (2010): 3. - - -# Threefry is a counter based PRNG: given a unique input, it generates a unique random number. As -# there is no state to maintain, we can apply it to a sequence of numbers (0..N) to generate a -# sequence of random numbers in parallel. In order to make the PRNG splittable (that is we can -# generate a sequence of random numbers in one place, and another sequence in another), we add a -# path and key in addition to the counter. 
The path allows us to encode a sequence of splits (a 0 in -# the path indicates the left result of a split, a 1 indicates the right). To avoid continuously -# growing the path, we can compress an existing path into the key portion of the generator by -# hashing the current key, path, and counter to create the new key (this same technique is used if -# we run out of room for the counter). They key is initialized with a unique initial state. -# -# Random numbers are generated by applying the Threefry hash to the current key, path, and counter. - -# This module use encoding e4 from the appendix of "Splittable Pseudorandom Number Generators using -# Cryptographic Hashing" (confusingly, the definition in the paper uses e3 to define the encoding -# function). This encoding uses a 10 element uint64 tensor where each byte means the following: - -# .. code-block: - -# gen: -# words: 0 1 2 3 | 4 5 | 6 7 | 8 9 -# usage: key | path | counter | position of next step in path encoded in binary -# ex: 0b00010 -> next path entry goes one from the right - -# Right now, counter only uses the rightmost word. 
- -# Threefry rotation constants from the Skein paper ("The Skein Hash Function Family" -# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf) -_ROTATIONS = { - 4: [[14, 16], [52, 57], [23, 40], [5, 37], [25, 33], [46, 12], [58, 22], [32, 32]], - 8: [ - [46, 36, 19, 37], - [33, 27, 14, 42], - [17, 49, 36, 39], - [44, 9, 54, 56], - [39, 30, 34, 24], - [13, 50, 10, 17], - [25, 29, 39, 43], - [8, 35, 56, 22], - ], - 16: [ - [24, 13, 8, 47, 8, 17, 22, 37], - [38, 19, 10, 55, 49, 18, 23, 52], - [33, 4, 51, 13, 34, 41, 59, 17], - [5, 20, 48, 41, 47, 28, 16, 25], - [41, 9, 37, 31, 12, 47, 44, 30], - [16, 34, 56, 51, 4, 53, 42, 41], - [31, 44, 47, 46, 19, 42, 44, 25], - [9, 48, 35, 52, 23, 31, 37, 20], - ], -} - -# Threefry permutation constants from the Skein paper ("The Skein Hash Function Family" -# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf) -_PERMUTATIONS = { - 4: [0, 3, 2, 1], - 8: [2, 1, 4, 7, 6, 5, 0, 3], - 16: [0, 9, 2, 13, 6, 11, 4, 15, 10, 7, 12, 3, 14, 5, 8, 1], -} - - -def _threefry( - irb, key_buf, key_offset, counter_buf, counter_offset, out_buf, out_offset, out_shape -): - """IRBuilder code for running Threefry - - Parameters - ---------- - irb: IRBuilder - IRBuilder that this code will be generated for. - - key_buf: BufferVar - Buffer to read the key from. - - key_offset: number - Threefry will write to :code:`key_buf[key_offset:key_offset+4]` - - counter_buf: BufferVar - Buffer to read the counter from. - - counter_offset: number - Threefry will write to :code:`counter_buf[counter_offset:counter_offset+4]` - - out_buf: BufferVar - Buffer to read the counter from. - - out_offset: number - Threefry will write to :code:`out_buf[out_offset:out_offset+4*product(out_shape)]` - - out_shape: number - Determines the number of output states to generate. :code:`state[i]` will correspond to - counter+i. 
- """ - nrounds = 20 - nwords = 4 - iwidth = 64 - assert nrounds % 4 == 0 - assert nwords in [4, 8, 16] - - # The paper has constants for 32 bit threefry, but we keep the implementation simple by only - # using 64-bit words. - assert key_buf.dtype == "uint64", "threefry only supports 64-bit keys" - assert key_buf.dtype == counter_buf.dtype, "threefry key and counter must be the same dtype" - - def mix(a, b, rotation): - x = a + b # wrapping - y = x ^ ((b << rotation) | (b >> (iwidth - rotation))) - return [x, y] - - # temporary buffer for holding the results of _PERMUTATIONS - tmp = irb.allocate(out_buf.dtype, out_shape * nwords, name="tmp", scope="global") - tmp_offset = 0 - - # Initialize entire key. It is composed of the original key with one - # element appended. The appended element is the xor of all key words plus a - # constant. - full_key = irb.allocate("uint64", nwords + 1, name="full_key", scope="global") - for i in range(nwords): - full_key[i] = key_buf[key_offset + i] - # initial key constant, full_key[nwords] is equivalent to k_{N_W} in the Skein paper. 
- full_key[nwords] = tvm.tir.const(0x1BD11BDAA9FC1A22, dtype="uint64") - for i in range(nwords): - full_key[nwords] ^= key_buf[key_offset + i] - - with irb.for_range(0, out_shape, dtype="uint64", name="i") as i: - for j in range(nwords): - out_buf[out_offset + i * nwords + j] = counter_buf[counter_offset + j] + i - - def key_schedule(s, i): - # Threefry uses no tweak, so the key schedule is simple - if i == nwords - 1: - return full_key[(s + i) % (nwords + 1)] + tvm.tir.const(s, dtype="uint64") - return full_key[(s + i) % (nwords + 1)] - - with irb.for_range(0, out_shape, name="l") as l: # pylint: disable=invalid-name - for i in range(nrounds // 4): - for j in range(nwords): - out_buf[out_offset + l * nwords + j] += key_schedule(i, j) # wrapping - for k in range(4): - for j in range(nwords // 2): - ( - out_buf[out_offset + l * nwords + j * 2 + 0], - out_buf[out_offset + l * nwords + j * 2 + 1], - ) = mix( - out_buf[out_offset + l * nwords + j * 2 + 0], - out_buf[out_offset + l * nwords + j * 2 + 1], - _ROTATIONS[nwords][(i * 4 + k) % 8][j], - ) - for j in range(nwords): - tmp[tmp_offset + l * nwords + j] = out_buf[ - out_offset + l * nwords + _PERMUTATIONS[nwords][j] - ] - # number of rounds is even, so out always contains the result - (out_buf, tmp) = (tmp, out_buf) - (out_offset, tmp_offset) = (tmp_offset, out_offset) - - -def threefry_generate(gen, out_shape): - """Generate a series of random values - - Notes - ----- - This function uses the counter portion of the generator state to generate a series of random - numbers in parallel. Random number `i` is generated by applying Threefry to the current - generator state with the counter portion incremented by `i`. This means that each random number - is generated independently from each other random number, so we can compute them in parallel. 
- - If there is not enough room left in the counter to generate the desired shape of random values, - then a new generator is created by applying Threefry to the current key, path, and counter. - This new generator will have a reset counter. - - Warning - ------- - Threeyfry requires that unsigned integer arithmetic wraps on overflow. Currently TVM has no - guarantee of this, so threefry contains an internal assert to check wrapping behavior. This - assert may or may not run depending on your platform, so it is recommended you run - :py:func:`threefry_test_wrapping` to verify wrapping behavior. - - Parameters - ---------- - gen : Tensor[10, uint64] - Generator state. Can be create with :py:func:`tvm.relay.random.threefry_key`. This should - not be reused in another function, otherwise random numbers will be repeated. - - out_shape : Sequence[int] - Output shape of the random numbers. - - Returns - ------- - new_gen : Tensor[10, uint64] - The new generator state to be used in subsequent calls. - - rand : Tensor[out_shape, uint64] - Tensor of random numbers with shape `out_shape`. - """ - out_len = tir.const(1) - for s in out_shape: - out_len *= s - assert ( - out_len.value <= 2**64 - 1 - ), f"Can only generate up to 2^64 random numbers, but {out_len} were requested." - - def gen_ir(gen_ptr, out_gen_ptr, out_array_ptr): - irb = ir_builder.create() - gen = irb.buffer_ptr(gen_ptr) - out_gen = irb.buffer_ptr(out_gen_ptr) - out_array = irb.buffer_ptr(out_array_ptr) - - # Check that unsigned arithmetic wraps, as it is required to implement threefry correctly. - irb.emit( - tvm.tir.AssertStmt( - tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64") - == tvm.tir.const(0, "uint64"), - tvm.tir.StringImm( - "Unsigned integer arithmetic is not wrapping, but threefry requires wrapping." - ), - tvm.tir.Evaluate(0), - ) - ) - - # Create a temporary array to hold the generator state we will use to create the random - # numbers. 
We cannot use gen because we may need to update the key + path if there is not - # enough room in the counter. - tmp = irb.allocate(gen.dtype, 10, name="tmp", scope="global") - - # TODO(tkonolige): for now we only use the last word of the counter for counting. It is too - # much work to figure out how to do 128 bit addition. - - # Max value for counter should be 2**64-2 because we need to reserve a special value to - # indicate the counter is used up. - with irb.if_scope(gen[7] < tir.const(2**64 - 1, dtype=gen.dtype) - out_len): - for i in range(10): - tmp[i] = gen[i] - with irb.else_scope(): - # no room left in the counter, we have to change the path or key - with irb.if_scope(gen[8] == 0 and gen[9] == 0): - # out of room in the path, have to generate new key - - # The paper says the counter that we will be hashing should be a special value of - # all ones. We need to allocate some space for it because we cannot overwrite gen. - tmp_counter = irb.allocate(gen.dtype, 2, name="tmp_counter", scope="global") - tmp_counter[0] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype) - tmp_counter[1] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype) - _threefry(irb, gen, 0, tmp_counter, 0, tmp, 0, 1) - tmp[4] = tir.const(0, dtype=gen.dtype) # zero path, i.e. 
no path - tmp[5] = tir.const(0, dtype=gen.dtype) - tmp[6] = tir.const(0, dtype=gen.dtype) # zero counter - tmp[7] = tir.const(0, dtype=gen.dtype) - tmp[8] = tir.const(1 << 63, dtype=gen.dtype) # one in the leftmost position - tmp[9] = tir.const(0, dtype=gen.dtype) - with irb.else_scope(): - tmp[0] = gen[0] - tmp[1] = gen[1] - tmp[2] = gen[2] - tmp[3] = gen[3] - tmp[4] = gen[4] | gen[8] # add a 1 to the path - tmp[5] = gen[5] | gen[9] - tmp[6] = tir.const(0, dtype=gen.dtype) # zero counter - tmp[7] = tir.const(0, dtype=gen.dtype) - _shift_right(irb, gen[8], gen[9], tmp, 8, tmp, 9) - - # Compute random values - if out_len.value >= 4: - _threefry(irb, tmp, 0, tmp, 4, out_array, 0, out_len // 4) - if out_len.value % 4 != 0: - remaining = irb.allocate(gen.dtype, 4, name="remaining", scope="global") - tmp[7] = tmp[7] + tir.Cast(gen.dtype, out_len // 4 * 4) # increment counter - _threefry(irb, tmp, 0, tmp, 4, remaining, 0, 1) - with irb.for_range(0, out_len % 4, dtype="uint64", name="i") as i: - out_array[out_len // 4 * 4 + i] = remaining[i] - - # Update generator state - out_gen[0] = tmp[0] # key stays the same - out_gen[1] = tmp[1] - out_gen[2] = tmp[2] - out_gen[3] = tmp[3] - out_gen[4] = tmp[4] # path stays the same - out_gen[5] = tmp[5] - out_gen[6] = tir.const(0, dtype=gen.dtype) # unused, leave it as 0 - if out_len.value % 4 != 0: - # increment counter for the remaining - # as we will generate 4 random numbers for the remaining, increase 4 here. - # the main increment was done before the second _threefry. 
- out_gen[7] = tmp[7] + tir.Cast(gen.dtype, 4) - else: - out_gen[7] = tmp[7] + tir.Cast(gen.dtype, out_len) # increment counter - out_gen[8] = tmp[8] # path unchanged, so no update here - out_gen[9] = tmp[9] - - return irb.get() - - out_gen = tvm.tir.decl_buffer((10,), name="out_gen", dtype="uint64") - out_array = tvm.tir.decl_buffer(out_shape, name="out_array", dtype="uint64") - return tvm.te.extern( - [out_gen.shape, out_array.shape], - [gen], - lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]), - out_buffers=[out_gen, out_array], - name="threefry_generate", - tag="threefry_generate", - ) - - -def _shift_right(irb, a, b, out_a, a_off, out_b, b_off): - """Binary shift a 128bit number composed of two 64 bit words right by one.""" - with irb.if_scope(a == 1): - out_a[a_off] = tir.const(0, dtype=a.dtype) - out_b[b_off] = tir.const(0x8000000000000000, dtype=a.dtype) - with irb.else_scope(): - with irb.if_scope(a == 0): - out_a[a_off] = tir.const(0, dtype=a.dtype) - out_b[b_off] = b >> 1 - with irb.else_scope(): - out_a[a_off] = a >> 1 - out_b[b_off] = tir.const(0, dtype=a.dtype) - - -def threefry_split(gen): - """Split a single generator state into two new ones - - Notes - ----- - The new generator is created by appending a one (for the right output) or a zero (for the left - output) to the end of the path portion of the generator If there is no longer and room in the - path, then we create a new key portion of the generator by applying Threefry to the old state, - path, and counter. i.e. :code:`new_key = threefry(old_key, [old_path, old_counter])`. This - resets the path portion of the new generator. - - Parameters - ---------- - gen : Tensor[10, uint64] - Generator state. Can be create with :py:func:`tvm.relay.random.threefry_key`. This should - not be reused in another function, otherwise random numbers will be repeated. - - Returns - ------- - out_gen_left : Tensor[10, uint64] - New generator state that is distinct from `out_gen_right`. 
- - out_gen_right : Tensor[10, uint64] - New generator state that is distinct from `out_gen_left`. - """ - - def gen_ir(gen_ptr, out_left_ptr, out_right_ptr): - irb = ir_builder.create() - gen = irb.buffer_ptr(gen_ptr) - out_left = irb.buffer_ptr(out_left_ptr) - out_right = irb.buffer_ptr(out_right_ptr) - - with irb.if_scope(gen[8] == 0 and gen[9] == 0): - # Generate new key because we have run out of room to extend the path - _threefry(irb, gen, 0, gen, 4, out_left, 0, 1) - out_left[4] = tir.const(0, dtype=gen.dtype) - out_left[5] = tir.const(0, dtype=gen.dtype) - out_left[6] = tir.const(0, dtype=gen.dtype) # counter gets zeroed - out_left[7] = tir.const(0, dtype=gen.dtype) # counter gets zeroed - out_left[8] = tir.const( - 1 << 62, dtype=gen.dtype - ) # one in the second from the leftmost position - out_left[9] = tir.const(0, dtype=gen.dtype) - - out_right[0] = out_left[0] - out_right[1] = out_left[1] - out_right[2] = out_left[2] - out_right[3] = out_left[3] - out_right[4] = tir.const(1 << 63, dtype=gen.dtype) # one in the leftmost position - out_right[5] = tir.const(0, dtype=gen.dtype) - out_right[6] = tir.const(0, dtype=gen.dtype) - out_right[7] = tir.const(0, dtype=gen.dtype) - out_right[8] = tir.const( - 1 << 62, dtype=gen.dtype - ) # one in the second from the leftmost position - out_right[9] = tir.const(0, dtype=gen.dtype) - with irb.else_scope(): - out_left[0] = gen[0] - out_left[1] = gen[1] - out_left[2] = gen[2] - out_left[3] = gen[3] - out_left[4] = gen[4] # adding a zero here, but its already zero padded - out_left[5] = gen[5] - out_left[6] = gen[6] - out_left[7] = gen[7] - # move path position over one bit - _shift_right(irb, gen[8], gen[9], out_left, 8, out_left, 9) - - out_right[0] = gen[0] - out_right[1] = gen[1] - out_right[2] = gen[2] - out_right[3] = gen[3] - out_right[4] = gen[4] | gen[8] # add a one to the path - out_right[5] = gen[5] | gen[9] - out_right[6] = gen[6] - out_right[7] = gen[7] - _shift_right(irb, gen[8], gen[9], out_right, 8, 
out_right, 9) - - return irb.get() - - out_left = tvm.tir.decl_buffer((10,), name="out_left", dtype="uint64") - out_right = tvm.tir.decl_buffer((10,), name="out_right", dtype="uint64") - return tvm.te.extern( - [out_left.shape, out_right.shape], - [gen], - lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]), - out_buffers=[out_left, out_right], - name="threefry_split", - tag="threefry_split", - ) - - -def threefry_test_wrapping(target, device): - """Test that unsigned arithmetic wraps on overflow. - - Parameters - ---------- - target : tvm.target.Target - Target to run against - device : tvm.runtime.Device - Context to run the test on - - Returns - ------- - is_wrapping : bool - Whether or not unsigned integer arithmetic is wrapping for this target, context pair. True - indicates that threefry will work on this platform. - """ - if isinstance(target, str): - target = tvm.target.Target(target) - - def gen_ir(out_ptr): - irb = ir_builder.create() - out = irb.buffer_ptr(out_ptr) - if "gpu" in target.keys: - thread_x = tvm.te.thread_axis("threadIdx.x") - irb.scope_attr(thread_x, "thread_extent", 1) - out[0] = tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64") - return irb.get() - - out = tvm.tir.decl_buffer((1,), dtype="uint64") - f = tvm.te.extern( - [out.shape], [], lambda ins, outs: gen_ir(outs[0]), dtype="uint64", out_buffers=[out] - ) - s = tvm.te.create_schedule([f.op]) - out_ary = tvm.nd.array(np.ones((1,), "uint64"), device) - tvm.build(s, [f], target=target)(out_ary) - return out_ary.numpy()[0] == 0 - - -def uniform(gen, low, high, out_shape, out_dtype): - """Draw samples from a uniform distribution. - - Samples are uniformly distributed over the half-open interval [low, high) - (includes low, but excludes high). In other words, any value within the - given interval is equally likely to be drawn by uniform. - - Parameters - ---------- - gen : ThreefryKey - Generator state. Can be create with :py:func:`tvm.relay.threefry_key`. 
This should not be - reused in another function, otherwise random numbers will be repeated. - - low : Tensor[(), out_dtype] - Lower boundary of the output interval. All values generated will be - greater than or equal to low. - - high : Tensor[(), out_dtype] - Upper boundary of the output interval. All values generated will be - less than high. - - out_shape : Sequence[int] - Output shape of the random numbers. - - out_dtype : str - The output dtype. - - Returns - ------- - new_gen : ThreefryKey - New generator state that is distinct from `gen`. - - out : Tensor[out_shape, out_dtype] - Tensor of random numbers with shape `out_shape` and type `out_dtype`. - """ - new_gen, random_bits = threefry_generate(gen, out_shape) - assert out_dtype in ( - "float32", - "float64", - ), f"Only support float32 or float64 for now, got {out_dtype}" - if out_dtype == "float32": - random_dtype = "uint32" - nbits = 32 - nfraction = 23 - elif out_dtype == "float64": - random_dtype = "uint64" - nbits = 64 - nfraction = 52 - nexp = nbits - nfraction - 1 - random_bits = random_bits.astype(random_dtype) - - fraction = tvm.topi.right_shift( - random_bits, tvm.tir.const(nbits - nfraction, dtype=random_dtype) - ) - exponent = tvm.topi.left_shift( - tvm.topi.full(out_shape, random_dtype, (1 << (nexp - 1)) - 1), - tvm.tir.const(nfraction, dtype=random_dtype), - ) - mantissa = tvm.topi.bitwise_or(fraction, exponent).astype(random_dtype) - standard_uniform_values = tvm.topi.reinterpret(mantissa, out_dtype) - tvm.tir.const( - 1, dtype=out_dtype - ) - uniform_values = tvm.topi.add(tvm.topi.multiply(standard_uniform_values, high - low), low) - - return new_gen, uniform_values - - -def normal(gen, mean, scale, out_shape, out_dtype): - """Draw samples from a normal distribution. - The algorithm is based on Box-Muller transform - - Parameters - ---------- - gen : ThreefryKey - Generator state. Can be create with :py:func:`tvm.relay.threefry_key`. 
This should not be - reused in another function, otherwise random numbers will be repeated. - - mean : Tensor[(), out_dtype] - The mean of the normal distribution. - - scale : Tensor[(), out_dtype] - The standard deviation of the normal distribution. - - out_shape : Sequence[int] - Output shape of the random numbers. - - out_dtype : str - The output dtype. - - Returns - ------- - new_gen : ThreefryKey - New generator state that is distinct from `gen`. - - out : Tensor[out_shape, out_dtype] - Tensor of random numbers with shape `out_shape` and type `out_dtype`. - """ - out_shape = list(out_shape) - # Box-Muller transform need two pieces of original uniform data - out_shape.insert(0, 2) - new_gen, uniform_values = uniform( - gen, tvm.tir.const(0.0, out_dtype), tvm.tir.const(1.0, out_dtype), out_shape, out_dtype - ) - two_pi = tvm.tir.const(2.0 * math.pi, out_dtype) - uniform_values_1 = tvm.topi.strided_slice(uniform_values, [0], [1], strides=[1], axes=[0]) - uniform_values_1 = tvm.topi.squeeze(uniform_values_1, axis=0) - uniform_values_2 = tvm.topi.strided_slice(uniform_values, [1], [2], strides=[1], axes=[0]) - uniform_values_2 = tvm.topi.squeeze(uniform_values_2, axis=0) - uniform_values_1 = tvm.topi.subtract(tvm.tir.const(1.0, out_dtype), uniform_values_1) - sqrt_values = tvm.topi.sqrt( - tvm.topi.multiply(tvm.tir.const(-2.0, out_dtype), tvm.topi.log(uniform_values_1)) - ) - sin_values = tvm.topi.sin(tvm.topi.multiply(two_pi, uniform_values_2)) - random_values = tvm.topi.add( - tvm.topi.multiply(tvm.topi.multiply(sqrt_values, sin_values), scale), mean - ) - - return new_gen, random_values - - -def multinomial(gen, probs, num_samples): - """Draw samples from a multinomial distribution defined by the input tensor. - - Parameters - ---------- - gen : ThreefryKey - Generator state. Can be created with :py:func:`tvm.relay.threefry_key`. This should not be - reused in another function, otherwise random numbers will be repeated. 
- - probs: Tensor[(input_rows, indices), float] - A tensor containing the probabilities to sample from. Each value represents the - probability of choosing its corresponding index. If a tensor is provided, the last dimension - is treated independently. Negative values in this tensor will be clipped to zero to - represent they have no chance of being selected. - - num_samples: int - Number of samples to draw from each row. - - Returns - ------- - new_gen : ThreefryKey - New generator state that is distinct from `gen`. - - out : Tensor[(input_rows, num_samples), int64] - Tensor of sampled indices with shape `input_rows x num_samples` and type `out_dtype`. - """ - # Convert to float for consistent behavior. - probs = tvm.topi.cast(probs, "float32") - # Clip negative values to 0. - probs = tvm.topi.maximum(probs, 0) - # Normalize input probabilities. - probs = tvm.topi.divide(probs, tvm.topi.expand_dims(tvm.topi.sum(probs, axis=-1), -1)) - # Convert probability to cumulative sum. - cumulative_probs = tvm.topi.cumsum(probs, axis=-1) - # Sample a set of uniform values. - new_gen, uniform_values = uniform( - gen, - tvm.tir.const(0.0, "float32"), - tvm.tir.const(1.0, "float32"), - [*probs.shape[:-1], num_samples], - "float32", - ) - # Find index corresponding to sampled values. - closest_prob = tvm.topi.subtract( - tvm.topi.expand_dims(cumulative_probs, axis=-1), - tvm.topi.expand_dims(uniform_values, axis=-2), - ) - zeros = tvm.topi.full_like(closest_prob, 0) - ones = tvm.topi.full_like(closest_prob, 1) - # Find the smallest positive index for each sample. 
- cond = tvm.topi.greater(closest_prob, zeros) - closest_non_neg = tvm.topi.where(cond, closest_prob, ones) - sampled_indices = tvm.topi.argmin(closest_non_neg, axis=-2) - return new_gen, sampled_indices diff --git a/python/tvm/topi/sparse_fill_empty_rows.py b/python/tvm/topi/sparse_fill_empty_rows.py deleted file mode 100644 index 10dc6ee3bfa3..000000000000 --- a/python/tvm/topi/sparse_fill_empty_rows.py +++ /dev/null @@ -1,109 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHnew_sparse_indices WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# pylint: disable=no-else-return, too-many-locals, too-many-arguments, too-many-branches -# pylint: disable=undefined-variable, invalid-name -"""SparseFillEmptyRows operator""" -from ..te import hybrid - - -@hybrid.script -def _sparse_fill_empty_rows( - sparse_indices, - sparse_values, - dense_shape, - default_value, - new_sparse_indices_shape, - new_sparse_values_shape, - empty_row_indicator_shape, -): - default_value_ = int64(default_value[0]) - new_sparse_indices = output_tensor(new_sparse_indices_shape, "int64") - new_sparse_values = output_tensor(new_sparse_values_shape, "int64") - empty_row_indicator = output_tensor(empty_row_indicator_shape, "int64") - new_sparse_indices_row_id = 0 - - if int64(sparse_indices.shape[0]) == int64(0): # Handle Empty Case - # Fill all rows with default values - for i in range(0, new_sparse_indices_shape[0]): - new_sparse_indices[i, 0] = int64(i) - new_sparse_values[i] = default_value_ - empty_row_indicator[i] = int64(1) - for k in range(1, int64(new_sparse_indices_shape[1])): - new_sparse_indices[i, k] = int64(0) - - return (new_sparse_indices, new_sparse_values, empty_row_indicator) - - else: - # Iterate through sparse_indices and add rows if/when required - for i in range(0, int64(sparse_indices.shape[0])): - if i == 0: - prev_row_id = int64(0) - else: - prev_row_id = int64(sparse_indices[i - 1, 0] + 1) - row_id = int64(sparse_indices[i, 0]) - - # Since input is in row-major order, add rows between prev_row_id and row_id - for j in range(prev_row_id, row_id): - new_sparse_indices[new_sparse_indices_row_id, 0] = int64(j) - for k in range(1, int64(new_sparse_indices_shape[1])): - new_sparse_indices[new_sparse_indices_row_id, k] = int64(0) - empty_row_indicator[prev_row_id] = int64(1) - new_sparse_values[new_sparse_indices_row_id] = default_value_ - new_sparse_indices_row_id += 1 - - # Add current element to output - new_sparse_indices[new_sparse_indices_row_id, 0] = row_id - for k in range(1, 
int64(new_sparse_indices_shape[1])): - new_sparse_indices[new_sparse_indices_row_id, k] = int64(sparse_indices[i, k]) - new_sparse_values[new_sparse_indices_row_id] = int64(sparse_values[i]) - empty_row_indicator[row_id] = int64(0) - new_sparse_indices_row_id += 1 - - # Add rows with default value if last row id of sparse_indices is not dense_shape[0] - 1 - for i in range( - int64(sparse_indices[sparse_indices.shape[0] - 1, 0] + 1), int64(dense_shape[0]) - ): - - new_sparse_indices[new_sparse_indices_row_id, 0] = int64(i) - for k in range(1, int64(new_sparse_indices_shape[1])): - new_sparse_indices[new_sparse_indices_row_id, k] = int64(0) - empty_row_indicator[i] = int64(1) - new_sparse_values[new_sparse_indices_row_id] = default_value_ - new_sparse_indices_row_id += 1 - - return (new_sparse_indices, new_sparse_values, empty_row_indicator) - - -def sparse_fill_empty_rows( - sparse_indices, - sparse_values, - dense_shape, - default_value, - new_sparse_indices_shape, - new_sparse_values_shape, - empty_row_indicator_shape, -): - return _sparse_fill_empty_rows( - sparse_indices, - sparse_values, - dense_shape, - default_value, - new_sparse_indices_shape, - new_sparse_values_shape, - empty_row_indicator_shape, - ) diff --git a/python/tvm/topi/transform.py b/python/tvm/topi/transform.py index 2844825a4a73..c1f5bce94870 100644 --- a/python/tvm/topi/transform.py +++ b/python/tvm/topi/transform.py @@ -20,7 +20,6 @@ import tvm from tvm import te, topi -from tvm.te import hybrid from . import cpp, tag from .utils import const_vector, make_idx, within_index @@ -982,35 +981,6 @@ def adv_index(data, indices): return cpp.adv_index(data, indices) -@hybrid.script -def invert_permutation(data): - """Computes the inverse permutation of data. - - Parameters - ---------- - data : tvm.te.Tensor - Input data - - Returns - ------- - result : tvm.te.Tensor - Output tensor - - Examples - -------- - .. 
code-block:: python - - data = [3, 4, 0, 2, 1] - topi.invert_permutation(data) = [2, 4, 3, 0, 1] - """ - result = output_tensor(data.shape, data.dtype) - nums = data.shape[0] - for ind in range(nums): - r_ind = data[ind] - result[r_ind] = ind - return result - - def sliding_window(data, axis, window_shape, strides): """Slide a window over the data tensor. diff --git a/python/tvm/topi/unique.py b/python/tvm/topi/unique.py index 983c48615334..9c9732013413 100644 --- a/python/tvm/topi/unique.py +++ b/python/tvm/topi/unique.py @@ -17,9 +17,6 @@ # pylint: disable=invalid-name """Unique operator""" from tvm import te, tir -from ..te import hybrid -from .scan import cumsum -from .sort import sort, argsort def _calc_adjacent_diff_ir(data, output, binop=tir.Sub): @@ -82,234 +79,3 @@ def _calc_adjacent_diff(data, out_dtype="int32", binop=tir.Sub): name="_calc_adjacent_diff", tag="_calc_adjacent_diff_cpu", ) - - -@hybrid.script -def _calc_num_unique(inc_scan): - """Helper function to get the number of unique elements fron inc_scan tensor""" - output = output_tensor((1,), "int32") - output[0] = inc_scan[inc_scan.shape[0] - 1] + int32(1) - return output - - -def _calc_unique_ir( - data, argsorted_indices, inc_scan, index_converter, unique_elements, inverse_indices, counts -): - """Low level IR to calculate unique elements, inverse indices, and counts (optional) of - unique elements of 1-D array. - - Parameters - ---------- - data : Buffer - Input 1-D Buffer. - - argsorted_indices : Buffer - A buffer that stores the argsorted indices of the input data. - - inc_scan : Buffer - A buffer that stores the inclusive scan of the binary tir.NE adjacent difference - of the sorted data. - - index_converter (optional) : Buffer - An optional index converter that transforms the unique element index - such that new_idx = index_converter[old_idx]. - - unique_elements : Buffer - A buffer that stores the unique elements. 
- - inverse_indices : Buffer - A buffer that stores the index of each input data element in the unique element array. - - counts (optional) : Buffer - A buffer that stores the count of each unique element. - """ - ib = tir.ir_builder.create() - data_ptr = ib.buffer_ptr(data) - argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices) - inc_scan_ptr = ib.buffer_ptr(inc_scan) - unique_elements_ptr = ib.buffer_ptr(unique_elements) - inverse_indices_ptr = ib.buffer_ptr(inverse_indices) - - index_converter_ptr = None - if isinstance(index_converter, tir.Buffer): - index_converter_ptr = ib.buffer_ptr(index_converter) - - if isinstance(counts, tir.Buffer): - counts_ptr = ib.buffer_ptr(counts) - # use indices_ptr as a tmp buffer to store tids with inc_scan[tid] != inc_scan[tid-1] - unique_seq_indices_ptr = ib.buffer_ptr(inverse_indices) - - data_length = data.shape[0] - - # if need to return counts - if isinstance(counts, tir.Buffer): - num_unique = inc_scan_ptr[inc_scan.shape[0] - 1] + 1 - num_elements = data.shape[0] - unique_seq_indices_ptr[num_unique - 1] = num_elements - with ib.new_scope(): - with ib.for_range(0, data_length, kind="parallel") as i: - with ib.if_scope(i > 0): - with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]): - unique_seq_indices_ptr[inc_scan_ptr[i] - 1] = i - with ib.new_scope(): - with ib.for_range(0, num_unique, kind="parallel") as i: - unique_idx = i if not index_converter_ptr else index_converter_ptr[i] - with ib.if_scope(i == 0): - counts_ptr[unique_idx] = unique_seq_indices_ptr[i] - with ib.else_scope(): - counts_ptr[unique_idx] = ( - unique_seq_indices_ptr[i] - unique_seq_indices_ptr[i - 1] - ) - # calculate unique elements and inverse indices - with ib.new_scope(): - with ib.for_range(0, data_length, kind="parallel") as i: - data_idx = argsorted_indices_ptr[i] - unique_idx = ( - inc_scan_ptr[i] if not index_converter_ptr else index_converter_ptr[inc_scan_ptr[i]] - ) - inverse_indices_ptr[data_idx] = unique_idx - with ib.if_scope(i == 
0): - unique_elements_ptr[unique_idx] = data_ptr[data_idx] - with ib.else_scope(): - with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]): - unique_elements_ptr[unique_idx] = data_ptr[data_idx] - return ib.get() - - -@hybrid.script -def _calc_first_occurence(argsorted_indices, inc_scan): - """Hybrid script to calculate the first occurence of each unique element in the input data. - - Parameters - ---------- - argsorted_indices : tvm.te.Tensor - A tensor that stores the argsorted indices of the input data. - - inc_scan : tvm.te.Tensor - A tensor that stores the inclusive scan of the binary tir.NE adjacent difference - of the sorted data. - - first_occurence : tvm.te.Tensor - A tensor that stores the first occurence of each unique element in the input data. - """ - first_occurence = output_tensor(argsorted_indices.shape, "int32") - for i in parallel(argsorted_indices.shape[0]): - first_occurence[i] = argsorted_indices.shape[0] - for i in parallel(argsorted_indices.shape[0]): - if i == 0 or inc_scan[i] != inc_scan[i - 1]: - first_occurence[inc_scan[i]] = argsorted_indices[i] - return first_occurence - - -def unique(data, is_sorted=True, return_counts=False): - """ - Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to - have the same length of `data` and element with index >= num_unique[0] has undefined value. - - Parameters - ---------- - data : tvm.te.Tensor - A 1-D tensor of integers. - - sorted : bool - Whether to sort the unique elements in ascending order before returning as output. - - return_counts : bool - Whether to return the count of each unique element. - - Returns - ------- - unique : tvm.te.Tensor - A 1-D tensor containing the unique elements of the input data tensor. The same size as - the input data. If there are less unique elements than input data, the end of the tensor - is padded with zeros. - - indices : tvm.te.Tensor - A 1-D tensor. The same size as output. 
For each entry in output, it contains - the index of its first occurence in the input data. The end of the tensor is padded - with the length of the input data. - - inverse_indices : tvm.te.Tensor - A 1-D tensor. For each entry in data, it contains the index of that data element in - the unique array. (Note that inverse_indices is very similar to indices if output is not - sorted.) - - num_unique : tvm.te.Tensor - A 1-D tensor with size=1 containing the number of unique elements in the input data tensor. - - counts (optional) : tvm.te.Tensor - A 1-D tensor containing the count of each unique element in the output. - - Examples - -------- - .. code-block:: python - - [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False) - output = [4, 5, 1, 2, 3, _, _, _] - indices = [0, 1, 2, 3, 4, _, _, _] - inverse_indices = [0, 1, 2, 3, 4, 4, 0, 1] - num_unique = [5] - - [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True) - output = [4, 5, 1, 2, 3, _, _, _] - indices = [0, 1, 2, 3, 4, _, _, _] - inverse_indices = [0, 1, 2, 3, 4, 4, 0, 1] - num_unique = [5] - counts = [2, 2, 1, 1, 2, _, _, _] - - [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True) - output = [1, 2, 3, 4, 5, _, _, _] - indices = [2, 3, 4, 0, 1, _, _, _] - inverse_indices = [3, 4, 0, 1, 2, 2, 3, 4] - num_unique = [5] - """ - sorted_data = sort(data) - argsorted_indices = argsort(data, dtype="int32") - # adjacent difference - adjacent_diff = _calc_adjacent_diff(sorted_data, "int32", tir.NE) - # inclusive scan - inc_scan = cumsum(adjacent_diff, dtype="int32", exclusive=0) - # total number of unique elements - num_unique_elements = _calc_num_unique(inc_scan) - # prepare outputs - if return_counts: - out_data_shape = [data.shape] * 3 - out_dtypes = [data.dtype, "int32", "int32"] - else: - out_data_shape = [data.shape] * 2 - out_dtypes = [data.dtype, "int32"] - # prepare inputs and fcompute - - first_occurence = 
_calc_first_occurence(argsorted_indices, inc_scan) - if is_sorted: - in_data = [data, argsorted_indices, inc_scan] - if return_counts: - fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs) - else: - fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs, None) - - indices = first_occurence - else: - # calculate index converter by sorting unique elements by their first occurence - argsorted_first_occurence = argsort(first_occurence, dtype="int32") - index_converter = argsort(argsorted_first_occurence, dtype="int32") - in_data = [data, argsorted_indices, inc_scan, index_converter] - if return_counts: - fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs) - else: - fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs, None) - # First occurence is in order of sorted unique output, if we sort the first_occurence array - # we get the correct result - indices = sort(first_occurence) - - outs = te.extern( - out_data_shape, - in_data, - fcompute, - dtype=out_dtypes, - name="_calc_unique", - tag="_calc_unique_cpu", - ) - if return_counts: - return [outs[0], indices, outs[1], num_unique_elements, outs[2]] - return [outs[0], indices, outs[1], num_unique_elements] diff --git a/python/tvm/topi/vision/__init__.py b/python/tvm/topi/vision/__init__.py deleted file mode 100644 index 2861d31de0f4..000000000000 --- a/python/tvm/topi/vision/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=wildcard-import -"""VISION network operators""" -from __future__ import absolute_import as _abs - -from . import ssd -from .reorg import * -from .nms import * -from .rcnn import * diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py deleted file mode 100644 index 7bd94745e226..000000000000 --- a/python/tvm/topi/vision/nms.py +++ /dev/null @@ -1,1183 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=import-error, invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements, too-many-function-args -"""Non-maximum suppression operator""" -import tvm -from tvm import te - -from tvm.te import hybrid -from tvm.tir import if_then_else - -from ..sort import argsort -from ..math import cast -from ..transform import reshape, gather -from .. import reduction -from ..scan import cumsum -from .nms_util import ( - binary_search, - collect_selected_indices, - collect_selected_indices_and_scores, - run_all_class_nms, -) - - -@hybrid.script -def hybrid_rearrange_box_out(data, one, batch_size, num_anchors): - """Hybrid routine to rearrange nms output to - move all valid entries to top. - - Parameters - ---------- - data : tvm.te.Tensor or numpy NDArray - NMS output. 3-D tensor with shape - [batch_size, num_anchors, 6]. - - one: tvm.tir.const - Constant one with the same dtype as data. - - batch_size: tvm.tir.IntImm or tvm.tir.Var - Batch size. We need to pass it in since hybrid script doesn't support - binding variable to symbolic dim. - - num_anchors: tvm.tir.IntImm or tvm.tir.Var - Number of anchors. - - Returns - ------- - output : tvm.te.Tensor or numpy NDArray - Transformed NMS output. 3-D tensor with shape - [batch_size, num_anchors, 6]. 
- """ - elem_length = data.shape[2] - output = output_tensor((batch_size, num_anchors, elem_length), data.dtype) - valid_indices = allocate((batch_size,), "int32") - - for i in parallel(batch_size): - valid_indices[i] = 0 - for j in range(num_anchors): - if data[i, j, 0] >= 0: - for k in range(elem_length): - output[i, valid_indices[i], k] = data[i, j, k] - valid_indices[i] += 1 - if j >= valid_indices[i]: - for k in range(elem_length): - output[i, j, k] = -one - return output - - -@hybrid.script -def hybrid_rearrange_indices_out(data, one, batch_size, num_anchors): - """Hybrid routine to rearrange nms output to - move all valid entries to top. - - Parameters - ---------- - data : tvm.te.Tensor or numpy NDArray - NMS output. 3-D tensor with shape - [batch_size, num_anchors, 6] or - [batch_size, num_anchors, 5], or 2-D - tensor with shape [batch_size, num_anchors]. - - one: tvm.tir.const - Constant one with the same dtype as data. - - batch_size: tvm.tir.IntImm or tvm.tir.Var - Batch size. We need to pass it in since hybrid script doesn't support - binding variable to symbolic dim. - - num_anchors: tvm.tir.IntImm or tvm.tir.Var - Number of anchors. - - Returns - ------- - output : tvm.te.Tensor or numpy NDArray - 2-D tensor with shape [batch_size, num_anchors]. - - valid_box_count : tvm.te.Tensor or numpy NDArray - Tensor with shape [batch_size, 1], indicates - the valid number of boxes. 
- """ - valid_box_count = output_tensor((batch_size, 1), "int32") - output = output_tensor((batch_size, num_anchors), data.dtype) - valid_indices = allocate((batch_size,), "int32") - - for i in parallel(batch_size): - valid_indices[i] = 0 - for j in range(num_anchors): - if data[i, j] >= 0: - output[i, valid_indices[i]] = data[i, j] - valid_indices[i] += 1 - if data[i, j] > num_anchors or data[i, j] < -num_anchors: - output[i, valid_indices[i]] = 0 - valid_indices[i] += 1 - if j >= valid_indices[i]: - output[i, j] = -one - valid_box_count[i, 0] = valid_indices[i] - - return output, valid_box_count - - -@hybrid.script -def hybrid_get_valid_counts( - data, score_threshold, id_index, score_index, one, batch_size, num_anchors -): - """Hybrid routine to get valid count of bounding boxes - given a score threshold. Also moves valid boxes to the - top of input data. - - Parameters - ---------- - data : tvm.te.Tensor or numpy NDArray - Input data. 3-D tensor with shape [batch_size, num_anchors, 6] - or [batch_size, num_anchors, 5]. - - score_threshold : tvm.te.Tensor - Lower limit of score for valid bounding boxes. - - id_index : tvm.tir.const - index of the class categories, -1 to disable. - - score_index: tvm.tir.const - Index of the scores/confidence of boxes. - - one: tvm.tir.const - Constant one with the same dtype as data. - - batch_size: tvm.tir.IntImm or tvm.tir.Var - Batch size. We need to pass it in since hybrid script doesn't support - binding variable to symbolic dim. - - num_anchors: tvm.tir.IntImm or tvm.tir.Var - Number of anchors. - - Returns - ------- - valid_count : tvm.te.Tensor or numpy NDArray - 1-D tensor for valid number of boxes. - - out_tensor : tvm.te.Tensor or numpy NDArray - Rearranged data tensor. - - out_indices: tvm.te.Tensor or numpy NDArray - Related index in input data. 
- """ - box_data_length = data.shape[2] - valid_count = output_tensor((batch_size,), "int32") - out_tensor = output_tensor((batch_size, num_anchors, box_data_length), data.dtype) - out_indices = output_tensor((batch_size, num_anchors), "int32") - for i in parallel(batch_size): - valid_count[i] = 0 - for j in range(num_anchors): - score = data[i, j, score_index] - if score > score_threshold and (id_index < 0 or data[i, j, id_index] >= 0): - for k in range(box_data_length): - out_tensor[i, valid_count[i], k] = data[i, j, k] - out_indices[i, valid_count[i]] = j - valid_count[i] += 1 - if j >= valid_count[i]: - for k in range(box_data_length): - out_tensor[i, j, k] = -one - out_indices[i, j] = -1 - return valid_count, out_tensor, out_indices - - -def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): - """Get valid count of bounding boxes given a score threshold. - Also moves valid boxes to the top of input data. - - Parameters - ---------- - data : tvm.te.Tensor - Input data. 3-D tensor with shape [batch_size, num_anchors, 6] - or [batch_size, num_anchors, 5]. - - score_threshold : optional, float - Lower limit of score for valid bounding boxes. - - id_index : optional, int - index of the class categories, -1 to disable. - - score_index: optional, int - Index of the scores/confidence of boxes. - - Returns - ------- - valid_count : tvm.te.Tensor - 1-D tensor for valid number of boxes. - - out_tensor : tvm.te.Tensor - Rearranged data tensor. - - out_indices: tvm.te.Tensor or numpy NDArray - Related index in input data. 
- """ - if isinstance(score_threshold, (float, int)): - score_threshold = tvm.tir.const(score_threshold, dtype=data.dtype) - id_index_const = tvm.tir.const(id_index, "int32") - score_index_const = tvm.tir.const(score_index, "int32") - return hybrid_get_valid_counts( - data, - score_threshold, - id_index_const, - score_index_const, - tvm.tir.const(1, data.dtype), - data.shape[0], - data.shape[1], - ) - - -@hybrid.script -def hybrid_nms( - data, - sorted_index, - valid_count, - indices, - batch_size, - num_anchors, - max_output_size, - iou_threshold, - force_suppress, - top_k, - coord_start, - score_index, - id_index, - return_indices, - zero, - one, -): - """Hybrid routing for non-maximum suppression. - - Parameters - ---------- - data: tvm.te.Tensor or numpy NDArray - Bounding boxes with class and score. 3-D tensor with shape - [batch_size, num_anchors, 6]. It could be the second output - out_tensor of get_valid_counts. - - sorted_index : tvm.te.Tensor or numpy NDArray - Bounding box indexes sorted by score, with shape - [batch_size, num_anchors]. - - valid_count : tvm.te.Tensor or numpy NDArray - 1-D tensor for valid number of boxes. It could be the output - valid_count of get_valid_counts. - - indices : tvm.te.Tensor or numpy.NDArray - indices in original tensor, with shape [batch_size, num_anchors], - represents the index of box in original data. It could be the third - output out_indices of get_valid_counts. The values in the second - dimension are like the output of arange(num_anchors) if get_valid_counts - is not used before non_max_suppression. - - batch_size: tvm.tir.IntImm or tvm.tir.Var - Batch size. We need to pass it in since hybrid script doesn't support - binding variable to symbolic dim. - - num_anchors: tvm.tir.IntImm or tvm.tir.Var - The number of anchors. - - max_output_size : tvm.te.Tensor - Max number of output valid boxes for each instance. - Return all valid boxes if max_output_size < 0. 
- - iou_threshold : tvm.te.Tensor - Overlapping(IoU) threshold to suppress object with smaller score. - - force_suppress : tvm.tir.const - Whether to suppress all detections regardless of class_id. - - top_k : tvm.tir.const - Keep maximum top k detections before nms, -1 for no limit. - - coord_start : tvm.tir.const - Start index of the consecutive 4 coordinates. - - score_index: tvm.tir.const - Index of the scores/confidence of boxes. - - id_index : tvm.tir.const - index of the class categories, -1 to disable. - - return_indices : tvm.tir.const - Whether to return box indices in input data. - - zero: tvm.tir.const - Constant zero with the same dtype as data. - - one: tvm.tir.const - Constant one with the same dtype as data. - - Returns - ------- - output : tvm.te.Tensor - 3-D tensor with shape [batch_size, num_anchors, 6] - or [batch_size, num_anchors, 5]. - - box_indices: tvm.te.Tensor - 2-D tensor with shape [batch_size, num_anchors]. - """ - - box_data_length = data.shape[2] - - # box_indices is the expected indices of boxes - box_indices = output_tensor((batch_size, num_anchors), sorted_index.dtype) - output = output_tensor( - ( - batch_size, - num_anchors, - box_data_length, - ), - data.dtype, - ) - - for i in range(batch_size): - if iou_threshold > 0: - if valid_count[i] > 0: - # Reorder output - nkeep = valid_count[i] - if 0 < top_k < nkeep: - nkeep = top_k - for j in parallel(nkeep): - for k in range(box_data_length): - output[i, j, k] = data[i, sorted_index[i, j], k] - box_indices[i, j] = sorted_index[i, j] - if 0 < top_k < valid_count[i]: - for j in parallel(valid_count[i] - nkeep): - for k in range(box_data_length): - output[i, j + nkeep, k] = -one - box_indices[i, j + nkeep] = -1 - - # Apply nms - box_start_idx = coord_start - batch_idx = i - num_valid_boxes = 0 - - for j in range(valid_count[i]): - if num_valid_boxes == max_output_size: - for k in range(box_data_length): - output[i, j, k] = -one - box_indices[i, j] = -1 - - elif output[i, j, 
score_index] > 0: - box_a_idx = j - is_valid_box = 1 - - # a_l: left, a_t: top, a_r: right, a_b: bottom - a_l = min( - output[batch_idx, box_a_idx, box_start_idx], - output[batch_idx, box_a_idx, box_start_idx + 2], - ) - a_t = min( - output[batch_idx, box_a_idx, box_start_idx + 1], - output[batch_idx, box_a_idx, box_start_idx + 3], - ) - a_r = max( - output[batch_idx, box_a_idx, box_start_idx], - output[batch_idx, box_a_idx, box_start_idx + 2], - ) - a_b = max( - output[batch_idx, box_a_idx, box_start_idx + 1], - output[batch_idx, box_a_idx, box_start_idx + 3], - ) - - # check if current box j is valid by calculating iou with - # all existing valid boxes - for k in range(j): - check_iou = 0 - if ( - is_valid_box == 1 - and k < j - and output[i, k, score_index] > 0 - and (id_index < 0 or output[i, k, id_index] >= 0) - ): - if force_suppress: - check_iou = 1 - elif id_index < 0 or output[i, j, id_index] == output[i, k, id_index]: - check_iou = 1 - - if check_iou > 0: - box_b_idx = k - - # b_l: left, b_t: top, b_r: right, b_b: bottom - b_l = min( - output[batch_idx, box_b_idx, box_start_idx], - output[batch_idx, box_b_idx, box_start_idx + 2], - ) - b_t = min( - output[batch_idx, box_b_idx, box_start_idx + 1], - output[batch_idx, box_b_idx, box_start_idx + 3], - ) - b_r = max( - output[batch_idx, box_b_idx, box_start_idx], - output[batch_idx, box_b_idx, box_start_idx + 2], - ) - b_b = max( - output[batch_idx, box_b_idx, box_start_idx + 1], - output[batch_idx, box_b_idx, box_start_idx + 3], - ) - - # Overlapping width and height - w = max(zero, min(a_r, b_r) - max(a_l, b_l)) - h = max(zero, min(a_b, b_b) - max(a_t, b_t)) - - # Overlapping area - area = h * w - - # total area of the figure formed by box a and box b - # except for overlapping area - u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area - - # get the iou - iou = zero if u <= zero else area / u - - if iou >= iou_threshold: - is_valid_box = 0 - - if is_valid_box == 0: - for k in 
range(box_data_length): - output[i, j, k] = -one - box_indices[i, j] = -1 - else: - num_valid_boxes += 1 - - else: - for j in parallel(valid_count[i]): - for k in range(box_data_length): - output[i, j, k] = data[i, j, k] - box_indices[i, j] = j - - # Set invalid entry to be -1 - for j in parallel(num_anchors - valid_count[i]): - for k in range(box_data_length): - output[i, j + valid_count[i], k] = -one - box_indices[i, j + valid_count[i]] = -1 - - if return_indices: - for j in range(valid_count[i]): - idx = box_indices[i, j] - if box_indices[i, j] >= 0: - box_indices[i, j] = indices[i, idx] - - return output, box_indices - - -@tvm.target.generic_func -def non_max_suppression( - data, - valid_count, - indices, - max_output_size=-1, - iou_threshold=0.5, - force_suppress=False, - top_k=-1, - coord_start=2, - score_index=1, - id_index=0, - return_indices=True, - invalid_to_bottom=False, -): - """Non-maximum suppression operator for object detection. - - Parameters - ---------- - data : tvm.te.Tensor - 3-D tensor with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5]. - - valid_count : tvm.te.Tensor - 1-D tensor for valid number of boxes. - - indices : tvm.te.Tensor - 2-D tensor with shape [batch_size, num_anchors]. - - max_output_size : optional, int or tvm.te.Tensor - Max number of output valid boxes for each instance. - Return all valid boxes if the value of max_output_size is less than 0. - - iou_threshold : optional, float or tvm.te.Tensor - Non-maximum suppression threshold. - - force_suppress : optional, boolean - Whether to suppress all detections regardless of class_id. - - top_k : optional, int - Keep maximum top k detections before nms, -1 for no limit. - - coord_start : required, int - Start index of the consecutive 4 coordinates. - - score_index: optional, int - Index of the scores/confidence of boxes. - - id_index : optional, int - index of the class categories, -1 to disable. 
- - return_indices : optional, boolean - Whether to return box indices in input data. - - invalid_to_bottom : optional, boolean - Whether to move all valid bounding boxes to the top. - - Returns - ------- - out : tvm.te.Tensor or tuple of tvm.te.Tensor - 3-D tensor with shape [batch_size, num_anchors, 6] - or [batch_size, num_anchors, 5]. Out is a tuple of tvm.te.Tensor - if return_indices is True, the Tensor in the tuple is 2-D tensor - with shape [batch_size, num_anchors] and shape - [batch_size, num_valid_anchors] respectively. - - Example - -------- - .. code-block:: python - - # An example to use non_max_suppression - dshape = (1, 5, 6) - data = te.placeholder(dshape, name="data") - valid_count = te.placeholder((dshape[0],), dtype="int32", name="valid_count") - iou_threshold = 0.7 - force_suppress = True - top_k = -1 - out = non_max_suppression(data, valid_count, indices, iou_threshold=iou_threshold, - force_suppress=force_suppress, top_k=top_k) - np_data = np.random.uniform(dshape) - np_valid_count = np.array([4]) - s = topi.generic.schedule_nms(out) - f = tvm.build(s, [data, valid_count, out], "llvm") - dev = tvm.cpu() - tvm_data = tvm.nd.array(np_data, dev) - tvm_valid_count = tvm.nd.array(np_valid_count, dev) - tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev) - f(tvm_data, tvm_valid_count, tvm_out) - """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - if isinstance(max_output_size, int): - max_output_size = tvm.tir.const(max_output_size, dtype="int32") - if isinstance(iou_threshold, float): - iou_threshold = tvm.tir.const(iou_threshold, dtype=data.dtype) - score_axis = score_index - score_shape = (batch_size, num_anchors) - score_tensor = te.compute(score_shape, lambda i, j: data[i, j, score_axis]) - sort_tensor = argsort(score_tensor, valid_count=valid_count, axis=1, is_ascend=False) - - out, box_indices = hybrid_nms( - data, - sort_tensor, - valid_count, - indices, - batch_size, - num_anchors, - max_output_size, - 
iou_threshold, - tvm.tir.const(force_suppress, dtype="bool"), - tvm.tir.const(top_k, dtype="int32"), - tvm.tir.const(coord_start, dtype="int32"), - tvm.tir.const(score_index, dtype="int32"), - tvm.tir.const(id_index, dtype="int32"), - tvm.tir.const(return_indices, dtype="bool"), - zero=tvm.tir.const(0, dtype=data.dtype), - one=tvm.tir.const(1, dtype=data.dtype), - ) - - if return_indices: - return hybrid_rearrange_indices_out( - box_indices, - one=tvm.tir.const(1, dtype="int32"), - batch_size=batch_size, - num_anchors=num_anchors, - ) - - if invalid_to_bottom: - out = hybrid_rearrange_box_out( - out, - one=tvm.tir.const(1, dtype=data.dtype), - batch_size=batch_size, - num_anchors=num_anchors, - ) - return out - - -def _nms_loop( - ib, - batch_size, - top_k, - iou_threshold, - max_output_size, - valid_count, - on_new_valid_box_func, - on_new_invalidated_box_func, - needs_bbox_check_func, - calc_overlap_func, - out_scores, - num_valid_boxes, -): - def nms_inner_loop(ib, i, j, nkeep, num_valid_boxes_local): - # The box j is valid, invalidate other boxes that overlap with j above iou_threshold - on_new_valid_box_func(ib, 0, num_valid_boxes_local[0], i, j) - num_valid_boxes_local[0] += 1 - - num_boxes_to_check = nkeep - (j + 1) - - with ib.for_range(0, num_boxes_to_check, name="_k", kind="parallel") as _k: - k = j + 1 + _k - - with ib.if_scope( - tvm.tir.all( - k < nkeep, - out_scores[i, k] > 0, # is the box k still valid? 
- needs_bbox_check_func(i, j, k), - ) - ): - iou = calc_overlap_func(i, j, k) - - with ib.if_scope(iou >= iou_threshold): - # invalidate the box k - out_scores[i, k] = -1.0 - on_new_invalidated_box_func(i, k) - - with ib.for_range(0, batch_size, name="i") as i: - nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) - max_output_size = if_then_else(max_output_size > 0, max_output_size, nkeep) - - with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): - num_valid_boxes_local = ib.allocate( - "int32", (1,), name="num_valid_boxes_local", scope="local" - ) - box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local") - num_valid_boxes_local[0] = 0 - box_idx[0] = 0 - - # Apply nms - # No need to do more iteration if we have already reached max_output_size boxes - with ib.while_loop( - tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size) - ): - # Proceed to the inner loop if the box with id box_idx is still valid - with ib.if_scope(out_scores[i, box_idx[0]] > -1.0): - nms_inner_loop(ib, i, box_idx[0], nkeep, num_valid_boxes_local) - box_idx[0] += 1 - - num_valid_boxes[i] = num_valid_boxes_local[0] - - with ib.else_scope(): - num_valid_boxes[i] = 0 - - return ib.get() - - -def _get_valid_box_count(scores, score_threshold): - batch_classes, num_boxes = scores.shape - - def searchsorted_ir(scores, valid_count): - ib = tvm.tir.ir_builder.create() - scores = ib.buffer_ptr(scores) - valid_count = ib.buffer_ptr(valid_count) - - with ib.for_range(0, batch_classes, name="i", kind="parallel") as i: - binary_search(ib, i, num_boxes, scores, score_threshold, valid_count) - - return ib.get() - - scores_buf = tvm.tir.decl_buffer(scores.shape, scores.dtype, "scores_buf", data_alignment=8) - - return te.extern( - [(batch_classes,)], - [scores], - lambda ins, outs: searchsorted_ir(ins[0], outs[0]), - dtype=["int32"], - in_buffers=[scores_buf], - name="searchsorted", - tag="searchsorted", - ) - - -def 
_collect_selected_indices_ir(num_class, selected_indices, num_detections, row_offsets, out): - batch_classes, _ = selected_indices.shape - - ib = tvm.tir.ir_builder.create() - - selected_indices = ib.buffer_ptr(selected_indices) - num_detections = ib.buffer_ptr(num_detections) - row_offsets = ib.buffer_ptr(row_offsets) - out = ib.buffer_ptr(out) - - with ib.for_range(0, batch_classes, name="i", kind="parallel") as i: - i = cast(i, "int64") - batch_id = i // num_class - class_id = i % num_class - - with ib.for_range(0, num_detections[i], name="j") as j: - out[row_offsets[i] + j, 0] = batch_id - out[row_offsets[i] + j, 1] = class_id - out[row_offsets[i] + j, 2] = cast(selected_indices[i, j], "int64") - - return ib.get() - - -def _collect_selected_indices_and_scores_ir( - selected_indices, - selected_scores, - num_detections, - row_offsets, - num_total_detections, - collected_indices, - collected_scores, -): - batch_size, num_class = row_offsets.shape - num_boxes = selected_indices.shape[1] - - ib = tvm.tir.ir_builder.create() - - selected_indices = ib.buffer_ptr(selected_indices) - selected_scores = ib.buffer_ptr(selected_scores) - num_detections = ib.buffer_ptr(num_detections) - row_offsets = ib.buffer_ptr(row_offsets) - num_total_detections = ib.buffer_ptr(num_total_detections) - collected_indices = ib.buffer_ptr(collected_indices) - collected_scores = ib.buffer_ptr(collected_scores) - zero = cast(0, "int64") - - with ib.for_range(0, batch_size * num_class, name="i", kind="parallel") as i: - i = cast(i, "int64") - batch_id = i // num_class - class_id = i % num_class - - with ib.for_range(0, num_boxes, name="j") as j: - with ib.if_scope(j < num_detections[batch_id, class_id]): - offset = row_offsets[batch_id, class_id] + j - collected_indices[batch_id, offset, 0] = class_id - collected_indices[batch_id, offset, 1] = cast(selected_indices[i, j], "int64") - collected_scores[batch_id, offset] = selected_scores[i, j] - with ib.else_scope(): - offset = ( - 
num_total_detections[batch_id] - + class_id * num_boxes - - row_offsets[batch_id, class_id] - + j - - num_detections[batch_id, class_id] - ) - collected_indices[batch_id, offset, 0] = zero - collected_indices[batch_id, offset, 1] = zero - collected_scores[batch_id, offset] = 0.0 - - return ib.get() - - -def all_class_non_max_suppression( - boxes, - scores, - max_output_boxes_per_class, - iou_threshold, - score_threshold, - output_format="onnx", -): - """Non-maximum suppression operator for object detection, corresponding to ONNX - NonMaxSuppression and TensorFlow combined_non_max_suppression. - NMS is performed for each class separately. - - Parameters - ---------- - boxes : tvm.te.Tensor - 3-D tensor with shape (batch_size, num_boxes, 4) - - scores: tvm.te.Tensor - 3-D tensor with shape (batch_size, num_classes, num_boxes) - - max_output_boxes_per_class : int or tvm.te.Tensor, optional - The maxinum number of output selected boxes per class - - iou_threshold : float or tvm.te.Tensor, optionaIl - IoU test threshold - - score_threshold : float or tvm.te.Tensor, optional - Score threshold to filter out low score boxes early - - output_format : str, optional - "onnx" or "tensorflow", see below. - - Returns - ------- - out : list of tvm.te.Tensor - If `output_format` is "onnx", the output is two tensors. The first is `indices` of size - `(batch_size * num_class* num_boxes , 3)` and the second is a scalar tensor - `num_total_detection` of shape `(1,)` representing the total number of selected - boxes. The three values in `indices` encode batch, class, and box indices. - Rows of `indices` are ordered such that selected boxes from batch 0, class 0 come - first, in descending of scores, followed by boxes from batch 0, class 1 etc. Out of - `batch_size * num_class* num_boxes` rows of indices, only the first `num_total_detection` - rows are valid. 
- - If `output_format` is "tensorflow", the output is three tensors, the first - is `indices` of size `(batch_size, num_class * num_boxes , 2)`, the second is `scores` of - size `(batch_size, num_class * num_boxes)`, and the third is `num_total_detection` of size - `(batch_size,)` representing the total number of selected boxes per batch. The two values - in `indices` encode class and box indices. Of num_class * num_boxes boxes in `indices` at - batch b, only the first `num_total_detection[b]` entries are valid. The second axis of - `indices` and `scores` are sorted within each class by box scores, but not across classes. - So the box indices and scores for the class 0 come first in a sorted order, followed by - the class 1 etc. - """ - batch, num_class, num_boxes = scores.shape - scores = reshape(scores, (batch * num_class, num_boxes)) - - sorted_indices = argsort(scores, axis=1, is_ascend=False, dtype="int32") - sorted_scores = gather(scores, 1, sorted_indices) - - valid_count = _get_valid_box_count(sorted_scores, score_threshold) - - selected_indices, selected_scores, num_detections = run_all_class_nms( - boxes, - sorted_scores, - sorted_indices, - valid_count, - max_output_boxes_per_class, - iou_threshold, - _nms_loop, - return_scores=(output_format == "tensorflow"), - ) - - if output_format == "onnx": - row_offsets = cumsum(num_detections, exclusive=True, dtype="int64") - num_total_detections = reduction.sum(cast(num_detections, "int64"), axis=1) - - selected_indices = collect_selected_indices( - num_class, selected_indices, num_detections, row_offsets, _collect_selected_indices_ir - ) - return [selected_indices, num_total_detections] - - num_detections_per_batch = reshape(num_detections, (batch, num_class)) - row_offsets = cumsum(num_detections_per_batch, exclusive=True, dtype="int64", axis=1) - num_total_detections = reduction.sum(cast(num_detections_per_batch, "int64"), axis=1) - - selected_indices, selected_scores = collect_selected_indices_and_scores( - 
selected_indices, - selected_scores, - num_detections_per_batch, - row_offsets, - num_total_detections, - _collect_selected_indices_and_scores_ir, - ) - - return [selected_indices, selected_scores, num_total_detections] - - -@hybrid.script -def hybrid_regular_nms( - boxes, - scores, - max_detections_per_class, - max_detections, - batch_size, - num_boxes, - num_classes, - num_classes_with_background, - iou_threshold, - score_threshold, -): - """Hybrid routing for regular non-maximum suppression. - - Parameters - ---------- - boxes : tvm.te.Tensor - 3-D tensor with shape (batch_size, num_boxes, 4) - - scores: tvm.te.Tensor - 3-D tensor with shape (batch_size, num_boxes, num_classes_with_background) - - max_detections_per_class : tvm.tir.const - The maxinum number of output selected boxes per class - - max_detections : tvm.tir.const - The maxinum number of output selected boxes - - batch_size : tvm.tir.IntImm or tvm.tir.Var - The number of batches - - num_boxes : tvm.tir.IntImm or tvm.tir.Var - The number of bounding boxes - - num_classes : tvm.tir.const - The number of classes without background - - num_classes_with_background : tvm.tir.IntImm or tvm.tir.Var - The number of classes including background ones - - iou_threshold : tvm.tir.const - IoU test threshold - - score_threshold : tvm.tir.const - Score threshold to filter out low score boxes early - - Returns - ------- - detection_boxes : tvm.te.Tensor - 3-D tensor with shape [batch_size, max_detections, 4]. - - detection_classes : tvm.te.Tensor - 2-D tensor with shape [batch_size, max_detections]. - - detection_scores : tvm.te.Tensor - 2-D tensor with shape [batch_size, max_detections]. - - num_detections : tvm.te.Tensor - 1-D tensor with shape [batch_size]. 
- """ - # output tensors - detection_boxes = output_tensor((batch_size, max_detections, 4), boxes.dtype) - detection_classes = output_tensor((batch_size, max_detections), "int32") - detection_scores = output_tensor((batch_size, max_detections), scores.dtype) - num_detections = output_tensor((batch_size,), "int32") - - # scratch buffers - class_scores = allocate((num_boxes,), scores.dtype) - keep_indices = allocate((num_boxes,), "int32") - keep_scores = allocate((num_boxes,), scores.dtype) - sorted_indices = allocate((max_detections + num_boxes,), "int32") - sorted_scores = allocate((max_detections + num_boxes,), scores.dtype) - active_box_candidate = allocate((num_boxes,), "int32") - selected = allocate((num_boxes,), "int32") - box_indices_after_regular_nms = allocate((max_detections + num_boxes,), "int32") - scores_after_regular_nms = allocate((max_detections + num_boxes,), scores.dtype) - - label_offset = num_classes_with_background - num_classes - tmp_idx = 0 - - for batch_idx in range(batch_size): - size_of_sorted_indices = 0 - - for class_id in range(num_classes): - for box_id in range(num_boxes): - # get scores of boxes corresponding to all anchors for single class - class_scores[box_id] = scores[batch_idx, box_id, class_id + label_offset] - - # perform non-maximal suppression on single class - - # select detections above score threshold - num_scores_kept = 0 - for i in range(num_boxes): - if class_scores[i] >= score_threshold: - keep_scores[num_scores_kept] = class_scores[i] - keep_indices[num_scores_kept] = i - num_scores_kept += 1 - - # iota - for i in range(num_scores_kept): - sorted_indices[i] = i - # decreasing sort of scores - for i in range(num_scores_kept): - for j in range(num_scores_kept - i - 1): - if keep_scores[sorted_indices[j]] < keep_scores[sorted_indices[j + 1]]: - tmp_idx = sorted_indices[j] - sorted_indices[j] = sorted_indices[j + 1] - sorted_indices[j + 1] = tmp_idx - - selected_size = 0 - - for i in range(num_scores_kept): - 
active_box_candidate[i] = 1 - - num_active_candidate = num_scores_kept - for i in range(num_scores_kept): - if ( - num_active_candidate != 0 - and selected_size < min(num_scores_kept, max_detections_per_class) - and active_box_candidate[i] == 1 - ): - selected[selected_size] = keep_indices[sorted_indices[i]] - selected_size += 1 - - active_box_candidate[i] = 0 - num_active_candidate -= 1 - - for j in range(i + 1, num_scores_kept): - if active_box_candidate[j] == 1: - # compute IOU - i_ymin = boxes[batch_idx, keep_indices[sorted_indices[i]], 0] - i_xmin = boxes[batch_idx, keep_indices[sorted_indices[i]], 1] - i_ymax = boxes[batch_idx, keep_indices[sorted_indices[i]], 2] - i_xmax = boxes[batch_idx, keep_indices[sorted_indices[i]], 3] - - j_ymin = boxes[batch_idx, keep_indices[sorted_indices[j]], 0] - j_xmin = boxes[batch_idx, keep_indices[sorted_indices[j]], 1] - j_ymax = boxes[batch_idx, keep_indices[sorted_indices[j]], 2] - j_xmax = boxes[batch_idx, keep_indices[sorted_indices[j]], 3] - - area_i = (i_ymax - i_ymin) * (i_xmax - i_xmin) - area_j = (j_ymax - j_ymin) * (j_xmax - j_xmin) - - iou = 0.0 - if area_i > 0 and area_j > 0: - intersection_ymin = max(i_ymin, j_ymin) - intersection_xmin = max(i_xmin, j_xmin) - intersection_ymax = min(i_ymax, j_ymax) - intersection_xmax = min(i_xmax, j_xmax) - intersection_area = max( - intersection_ymax - intersection_ymin, 0.0 - ) * max(intersection_xmax - intersection_xmin, 0.0) - iou = intersection_area / (area_i + area_j - intersection_area) - - if iou > iou_threshold: - active_box_candidate[j] = 0 - num_active_candidate -= 1 - - # end of non-maximal suppression on single class - - # add selected indices from non-max suppression of boxes in this class - output_index = size_of_sorted_indices - for i in range(selected_size): - selected_index = selected[i] - - box_indices_after_regular_nms[output_index] = ( - selected_index * num_classes_with_background + class_id + label_offset - ) - scores_after_regular_nms[output_index] = 
class_scores[selected_index] - - output_index += 1 - - # sort the max scores among the selected indices - # get the indices for top scores - num_indices_to_sort = min(output_index, max_detections) - - # iota - for i in range(output_index): - sorted_indices[i] = i - # deacreasing sort of scores - for i in range(output_index): - for j in range(output_index - i - 1): - if ( - scores_after_regular_nms[sorted_indices[j]] - < scores_after_regular_nms[sorted_indices[j + 1]] - ): - tmp_idx = sorted_indices[j] - sorted_indices[j] = sorted_indices[j + 1] - sorted_indices[j + 1] = tmp_idx - - # copy values to temporary vectors - for i in range(num_indices_to_sort): - sorted_scores[i] = scores_after_regular_nms[sorted_indices[i]] - sorted_indices[i] = box_indices_after_regular_nms[sorted_indices[i]] - - # copy scores and indices from temporary vectors - for i in range(num_indices_to_sort): - box_indices_after_regular_nms[i] = sorted_indices[i] - scores_after_regular_nms[i] = sorted_scores[i] - - size_of_sorted_indices = num_indices_to_sort - - # fill output tensors - for output_box_index in range(max_detections): - box_ymin = 0.0 - box_xmin = 0.0 - box_ymax = 0.0 - box_xmax = 0.0 - class_idx = 0 - selected_score = 0.0 - - if output_box_index < size_of_sorted_indices: - anchor_idx = ( - box_indices_after_regular_nms[output_box_index] // num_classes_with_background - ) - - box_ymin = boxes[batch_idx, anchor_idx, 0] - box_xmin = boxes[batch_idx, anchor_idx, 1] - box_ymax = boxes[batch_idx, anchor_idx, 2] - box_xmax = boxes[batch_idx, anchor_idx, 3] - class_idx = ( - box_indices_after_regular_nms[output_box_index] - - anchor_idx * num_classes_with_background - - label_offset - ) - selected_score = scores_after_regular_nms[output_box_index] - - detection_boxes[batch_idx, output_box_index, 0] = box_ymin - detection_boxes[batch_idx, output_box_index, 1] = box_xmin - detection_boxes[batch_idx, output_box_index, 2] = box_ymax - detection_boxes[batch_idx, output_box_index, 3] = box_xmax 
- detection_classes[batch_idx, output_box_index] = class_idx - detection_scores[batch_idx, output_box_index] = selected_score - - num_detections[batch_idx] = size_of_sorted_indices - - return detection_boxes, detection_classes, detection_scores, num_detections - - -def regular_non_max_suppression( - boxes, - scores, - max_detections_per_class, - max_detections, - num_classes, - iou_threshold, - score_threshold, -): - """Regular non-maximum suppression operator for object detection, corresponding to TFLite's - regular NMS. NMS is performed for each class separately. - - Parameters - ---------- - boxes : tvm.te.Tensor - 3-D tensor with shape (batch_size, num_boxes, 4). The four values in boxes - encode (ymin, xmin, ymax, xmax) coordinates of a box - - scores: tvm.te.Tensor - 3-D tensor with shape (batch_size, num_boxes, num_classes_with_background) - - max_detections_per_class : int - The maxinum number of output selected boxes per class - - max_detections : int - The maxinum number of output selected boxes - - num_classes : int - The number of classes without background - - iou_threshold : float - IoU test threshold - - score_threshold : float - Score threshold to filter out low score boxes early - - Returns - ------- - out : list of tvm.te.Tensor - The output is a list of four tensors. The first is `detection_boxes` of size - `(batch_size, max_detections , 4)`, the second is `detection_classes` of size - `(batch_size, max_detections)`, the third is `detection_scores` of size - `(batch_size, max_detections)`, and the fourth is `num_detections` of size `(batch_size,)` - representing the total number of selected boxes per batch. 
- """ - batch_size, num_boxes, num_classes_with_background = scores.shape - - detection_boxes, detection_classes, detection_scores, num_detections = hybrid_regular_nms( - boxes=boxes, - scores=scores, - max_detections_per_class=tvm.tir.const(max_detections_per_class, dtype="int32"), - max_detections=tvm.tir.const(max_detections, dtype="int32"), - batch_size=batch_size, - num_boxes=num_boxes, - num_classes=tvm.tir.const(num_classes, dtype="int32"), - num_classes_with_background=num_classes_with_background, - iou_threshold=tvm.tir.const(iou_threshold, dtype="float32"), - score_threshold=tvm.tir.const(score_threshold, dtype="float32"), - ) - - return [ - detection_boxes, - cast(detection_classes, dtype="float32"), - detection_scores, - num_detections, - ] diff --git a/python/tvm/topi/vision/nms_util.py b/python/tvm/topi/vision/nms_util.py deleted file mode 100644 index d12592fd111a..000000000000 --- a/python/tvm/topi/vision/nms_util.py +++ /dev/null @@ -1,338 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=invalid-name -"""Common utilities used in Non-maximum suppression operators""" -import tvm -from tvm import te - - -def _get_boundaries(output, box_idx): - l = tvm.te.min( - output[box_idx], - output[box_idx + 2], - ) - t = tvm.te.min( - output[box_idx + 1], - output[box_idx + 3], - ) - r = tvm.te.max( - output[box_idx], - output[box_idx + 2], - ) - b = tvm.te.max( - output[box_idx + 1], - output[box_idx + 3], - ) - return l, t, r, b - - -def calculate_overlap(out_tensor, box_a_idx, box_b_idx): - """Calculate overlap of two boxes.""" - a_l, a_t, a_r, a_b = _get_boundaries(out_tensor, box_a_idx) - b_l, b_t, b_r, b_b = _get_boundaries(out_tensor, box_b_idx) - - # Overlapping width and height - w = tvm.te.max(0.0, tvm.te.min(a_r, b_r) - tvm.te.max(a_l, b_l)) - h = tvm.te.max(0.0, tvm.te.min(a_b, b_b) - tvm.te.max(a_t, b_t)) - - # Overlapping area - area = h * w - - # total area of the figure formed by box a and box b - # except for overlapping area - u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area - return tvm.tir.Select(u <= 0.0, 0.0, area / u) - - -def binary_search(ib, y, num_boxes, scores, score_threshold, out): - """Binary search for score_threshold on scores sorted in descending order""" - lo = ib.allocate("int32", (1,), name="lo", scope="local") - hi = ib.allocate("int32", (1,), name="hi", scope="local") - - lo[0] = 0 - hi[0] = num_boxes - - with ib.while_loop(lo[0] < hi[0]): - mid = (hi[0] + lo[0]) >> 1 - with ib.if_scope(scores[y, mid] > score_threshold): - lo[0] = mid + 1 - with ib.else_scope(): - hi[0] = mid - - out[y] = lo[0] - - -def collect_selected_indices(num_class, selected_indices, num_detections, row_offsets, ir): - """Collect selected indices from the core NMS loop into one linear output - - Parameters - ---------- - num_class : int - - selected_indices: tvm.te.Tensor - 2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices - of selected boxes by the core NMS loop. 
- - num_detections tvm.te.Tensor - 1-D tensor with shape (batch_size * num_classes,), representing - the number of boxes selected by the core NMS loop, per batch and class - - row_offsets tvm.te.Tensor - 1-D tensor with shape (batch_size * num_classes,), this should be the exclusive scan - of num_detections - - ir : function - A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py - - Returns - ------- - out : tvm.te.Tensor - The output is indices of size (batch_size * num_class* num_boxes , 3). - Rows of indices are ordered such that selected boxes from batch 0, class 0 come - first, in descending of scores, followed by boxes from batch 0, class 1 etc. - """ - batch_class, num_boxes = selected_indices.shape - return te.extern( - [(batch_class * num_boxes, 3)], - [selected_indices, num_detections, row_offsets], - lambda ins, outs: ir(num_class, ins[0], ins[1], ins[2], outs[0]), - dtype=["int64"], - name="collect_indices", - tag="collect_indices", - ) - - -def collect_selected_indices_and_scores( - selected_indices, selected_scores, num_detections, row_offsets, num_total_detections, ir -): - """Collect selected indices and scores from the core NMS loop into one linear output - - Parameters - ---------- - num_class : int - - selected_indices: tvm.te.Tensor - 2-D tensor with shape (batch_size * num_classes, num_boxes), representing the indices - of selected boxes by the core NMS loop. - - selected_indices: tvm.te.Tensor - 2-D tensor with shape (batch_size * num_classes, num_boxes), representing the scores - of selected boxes by the core NMS loop. 
- - num_detections tvm.te.Tensor - 2-D tensor with shape (batch_size, num_classes), representing - the number of boxes selected by the core NMS loop, per batch and class - - row_offsets tvm.te.Tensor - 2-D tensor with shape (batch_size, num_classes), this should be the exclusive scan - of num_detections along axis 1 - - ir : function - A function to generate IR for CPU or GPU, see its usage in vision/nms.py and cuda/nms.py - - Returns - ------- - out : [tvm.te.Tensor, tvm.te.Tensor] - The output is two tensors. The first is indices of size - (batch_size, num_class* num_boxes, 2), and the second is scores of size - (batch_size, num_class* num_boxes). - """ - batch_size, num_class = row_offsets.shape - num_boxes = selected_indices.shape[1] - return te.extern( - [(batch_size, num_class * num_boxes, 2), (batch_size, num_class * num_boxes)], - [selected_indices, selected_scores, num_detections, row_offsets, num_total_detections], - lambda ins, outs: ir(ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], outs[1]), - dtype=["int64", "float32"], - name="collect_indices_and_scores", - tag="collect_indices_and_scores", - ) - - -def _all_class_nms_ir( - boxes, - sorted_scores, - sorted_indices, - valid_count, - batch_class, - num_class, - num_anchors, - iou_threshold, - max_output_size_per_class, - box_indices, - selected_scores, - num_valid_boxes, - nms_loop, -): - ib = tvm.tir.ir_builder.create() - boxes = ib.buffer_ptr(boxes) - sorted_scores = ib.buffer_ptr(sorted_scores) - sorted_indices = ib.buffer_ptr(sorted_indices) - valid_count = ib.buffer_ptr(valid_count) - box_indices = ib.buffer_ptr(box_indices) - num_valid_boxes = ib.buffer_ptr(num_valid_boxes) - - if selected_scores is not None: - selected_scores = ib.buffer_ptr(selected_scores) - - if isinstance(iou_threshold, float): - iou_threshold = tvm.tir.FloatImm("float32", iou_threshold) - - if isinstance(max_output_size_per_class, int): - max_output_size_per_class = tvm.tir.const(max_output_size_per_class) - - def 
calc_overlap(i, j, k): - offset_j = sorted_indices[i, j] * 4 - offset_k = sorted_indices[i, k] * 4 - batch_id = i // num_class - base_bbox_idx = batch_id * num_anchors * 4 - return calculate_overlap( - boxes, - base_bbox_idx + offset_j, - base_bbox_idx + offset_k, - ) - - def on_new_valid_box(ib, tid, num_current_valid_box, i, j): - with ib.if_scope(tid + 0 == 0): - box_indices[i, num_current_valid_box] = sorted_indices[i, j] - - if selected_scores is not None: - selected_scores[i, num_current_valid_box] = sorted_scores[i, j] - - def on_new_invalidated_box(*_): - pass - - def needs_bbox_check(*_): - return tvm.tir.const(True) - - return nms_loop( - ib, - batch_class, - tvm.tir.IntImm("int32", -1), # top_k - iou_threshold, - max_output_size_per_class, - valid_count, - on_new_valid_box, - on_new_invalidated_box, - needs_bbox_check, - calc_overlap, - sorted_scores, - num_valid_boxes, - ) - - -def run_all_class_nms( - boxes, - sorted_scores, - sorted_indices, - valid_count, - max_output_size_per_class, - iou_threshold, - nms_loop, - return_scores=False, -): - """The core all class NMS routine - - Parameters - ---------- - boxes : tvm.te.Tensor - 3-D tensor with shape (batch_size, num_boxes, 4) - - sorted_scores: tvm.te.Tensor - 2-D tensor with shape (batch_size * num_classes, num_boxes) - One of the outputs from argsort - - sorted_indices: tvm.te.Tensor - 2-D tensor with shape (batch_size * num_classes, num_boxes) - The other output from argsort - - valid_count: tvm.te.Tensor - 1-D tensor with shape (batch_size * num_classes,), representing - the number of boxes whose score is above score_threshold, per batch and class - - max_output_boxes_per_class : int or tvm.te.Tensor, optional - The maxinum number of output selected boxes per class - - iou_threshold : float or tvm.te.Tensor, optionaIl - IoU test threshold - - nms_loop : function - A core NMS loop, see its usage in vision/nms.py and cuda/nms.py - - return_scores : bool, optional - Whether or not to return selected 
scores, needed by the tensorflow output format. - - Returns - ------- - out : a list of tvm.te.Tensor - The output is three tensors, the first and second are indices and scores of size - (batch_size * num_class, num_boxes), and the third is a tensor - num_selected_boxes of shape (batch_size * num_class,) representing the total number of - selected boxes per batch and class. If return_scores is False, the second output is - None. - """ - batch, num_boxes, _ = boxes.shape - batch_class = sorted_scores.shape[0] - num_class = batch_class // batch - - if return_scores is False: - selected_indices, num_detections = te.extern( - [(batch_class, num_boxes), (1, batch_class)], - [boxes, sorted_scores, sorted_indices, valid_count], - lambda ins, outs: _all_class_nms_ir( - ins[0], # boxes - ins[1], # sorted_scores - ins[2], # sorted_indices - ins[3], # valid_count - batch_class, - num_class, - num_boxes, - iou_threshold, - max_output_size_per_class, - outs[0], # box_indices - None, # scores - outs[1], # num_selected_boxes - nms_loop, - ), - dtype=["int32", "int32"], - name="all_class_nms", - tag="all_class_nms", - ) - return selected_indices, None, num_detections - - return te.extern( - [(batch_class, num_boxes), (batch_class, num_boxes), (1, batch_class)], - [boxes, sorted_scores, sorted_indices, valid_count], - lambda ins, outs: _all_class_nms_ir( - ins[0], # boxes - ins[1], # sorted_scores - ins[2], # sorted_indices - ins[3], # valid_count - batch_class, - num_class, - num_boxes, - iou_threshold, - max_output_size_per_class, - outs[0], # box_indices - outs[1], # selected scores - outs[2], # num_selected_boxes - nms_loop, - ), - dtype=["int32", "float32", "int32"], - name="all_class_nms", - tag="all_class_nms", - ) diff --git a/python/tvm/topi/vision/rcnn/__init__.py b/python/tvm/topi/vision/rcnn/__init__.py deleted file mode 100644 index e5693e869445..000000000000 --- a/python/tvm/topi/vision/rcnn/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software 
Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=wildcard-import -"""Faster R-CNN and Mask R-CNN operators""" -from .roi_align import * -from .roi_pool import * -from .proposal import * diff --git a/python/tvm/topi/vision/rcnn/proposal.py b/python/tvm/topi/vision/rcnn/proposal.py deleted file mode 100644 index 12a0d6bcf0a0..000000000000 --- a/python/tvm/topi/vision/rcnn/proposal.py +++ /dev/null @@ -1,448 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=invalid-name, singleton-comparison, bad-continuation -"""Proposal operator""" -import math -import tvm -from tvm import te -from ...utils import get_const_tuple, get_const_int -from ...sort import argsort - - -def generate_anchor(ratio, scale, base_size): - """Generate anchor""" - w = h = float(base_size) - x_ctr = 0.5 * (w - 1.0) - y_ctr = 0.5 * (h - 1.0) - size = w * h - size_ratios = math.floor(size / ratio) - new_w = math.floor(math.sqrt(size_ratios) + 0.5) * scale - new_h = math.floor((new_w / scale * ratio) + 0.5) * scale - return ( - x_ctr - 0.5 * (new_w - 1.0), - y_ctr - 0.5 * (new_h - 1.0), - x_ctr + 0.5 * (new_w - 1.0), - y_ctr + 0.5 * (new_h - 1.0), - ) - - -def reg_bbox(x1, y1, x2, y2, dx, dy, dw, dh): - """Bounding box regression function""" - bbox_w = x2 - x1 + 1.0 - bbox_h = y2 - y1 + 1.0 - ctr_x = x1 + 0.5 * (bbox_w - 1.0) - ctr_y = y1 + 0.5 * (bbox_h - 1.0) - - pred_ctr_x = dx * bbox_w + ctr_x - pred_ctr_y = dy * bbox_h + ctr_y - pred_w = te.exp(dw) * bbox_w - pred_h = te.exp(dh) * bbox_h - - pred_x1 = pred_ctr_x - 0.5 * (pred_w - 1.0) - pred_y1 = pred_ctr_y - 0.5 * (pred_h - 1.0) - pred_x2 = pred_ctr_x + 0.5 * (pred_w - 1.0) - pred_y2 = pred_ctr_y + 0.5 * (pred_h - 1.0) - return pred_x1, pred_y1, pred_x2, pred_y2 - - -def reg_iou(x1, y1, x2, y2, dx1, dy1, dx2, dy2): - """Bounding box regression function""" - pred_x1 = x1 + dx1 - pred_y1 = y1 + dy1 - pred_x2 = x2 + dx2 - pred_y2 = y2 + dy2 - return pred_x1, pred_y1, pred_x2, pred_y2 - - -def predict_bbox_ir( - cls_prob_buf, - bbox_pred_buf, - im_info_buf, - out_buf, - scales, - ratios, - feature_stride, - rpn_min_size, - iou_loss, -): - """Predict bounding boxes based on anchors, scores and deltas. 
- - Parameters - ---------- - cls_prob_buf : tvm.te.schedule.Buffer - 4-D with shape [batch, 2 * num_anchors, height, width] - - bbox_pred_buf : tvm.te.schedule.Buffer - 4-D with shape [batch, 4 * num_anchors, height, width] - - im_info_buf : tvm.te.schedule.Buffer - 2-D with shape [batch, 3] - - out_buf : tvm.te.schedule.Buffer - 3-D with shape [batch, num_bbox, 5] - The last dimension is in format of [w_start, h_start, w_end, h_end, score] - - scales : list/tuple of float - Scales of anchor windows. - - ratios : list/tuple of float - Ratios of anchor windows. - - feature_stride : int - The size of the receptive field each unit in the convolution layer of the rpn, for example - the product of all stride's prior to this layer. - - rpn_min_size : int - Minimum height or width in proposal. - - iou_loss : bool - Usage of IoU loss. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - batch, num_anchors, height, width = get_const_tuple(cls_prob_buf.shape) - num_anchors //= 2 - ib = tvm.tir.ir_builder.create() - - p_score = ib.buffer_ptr(cls_prob_buf) - p_delta = ib.buffer_ptr(bbox_pred_buf) - p_im_info = ib.buffer_ptr(im_info_buf) - p_out = ib.buffer_ptr(out_buf) - - idxm = tvm.tir.indexmod - idxd = tvm.tir.indexdiv - - with ib.for_range(0, batch * height * width) as tid: - w = idxm(tid, width) - h = idxm(idxd(tid, width), height) - b = idxd(idxd(tid, width), height) - - for k in range(num_anchors): - out_index = tid * num_anchors + k - ratio = ratios[k // len(scales)] - scale = scales[k % len(scales)] - anchor = generate_anchor(ratio, scale, feature_stride) - im_height = p_im_info[b * 3] - im_width = p_im_info[b * 3 + 1] - x1 = anchor[0] + w * feature_stride - y1 = anchor[1] + h * feature_stride - x2 = anchor[2] + w * feature_stride - y2 = anchor[3] + h * feature_stride - - delta = [ - p_delta[((((b * num_anchors + k) * 4 + i) * height + h) * width + w)] - for i in range(4) - ] - regression_func = reg_iou if iou_loss else reg_bbox - pred_x1, pred_y1, 
pred_x2, pred_y2 = regression_func(x1, y1, x2, y2, *delta) - - pred_x1 = tvm.te.max(tvm.te.min(pred_x1, im_width - 1.0), 0.0) - pred_y1 = tvm.te.max(tvm.te.min(pred_y1, im_height - 1.0), 0.0) - pred_x2 = tvm.te.max(tvm.te.min(pred_x2, im_width - 1.0), 0.0) - pred_y2 = tvm.te.max(tvm.te.min(pred_y2, im_height - 1.0), 0.0) - - real_height = (im_height / feature_stride).astype("int32") - real_width = (im_width / feature_stride).astype("int32") - - bbox_w = pred_x2 - pred_x1 + 1.0 - bbox_h = pred_y2 - pred_y1 + 1.0 - min_size = p_im_info[b * 3 + 2] * rpn_min_size - - pred_score = p_score[((b * num_anchors * 2 + num_anchors + k) * height + h) * width + w] - pred_score = tvm.tir.Select( - tvm.tir.any(h >= real_height, w >= real_width), -1.0, pred_score - ) - p_out[out_index * 5 + 0] = pred_x1 - p_out[out_index * 5 + 1] = pred_y1 - p_out[out_index * 5 + 2] = pred_x2 - p_out[out_index * 5 + 3] = pred_y2 - p_out[out_index * 5 + 4] = pred_score - - with ib.if_scope(tvm.tir.any(bbox_w < min_size, bbox_h < min_size)): - p_out[out_index * 5 + 0] -= min_size / 2.0 - p_out[out_index * 5 + 1] -= min_size / 2.0 - p_out[out_index * 5 + 2] += min_size / 2.0 - p_out[out_index * 5 + 3] += min_size / 2.0 - p_out[out_index * 5 + 4] = -1.0 - - return ib.get() - - -def argsort_ir(data_buf, out_index_buf): - """Batched odd-even transposition sort. - - Parameters - ---------- - data_buf : tvm.te.schedule.Buffer - 2-D with shape [batch, num_bbox] - - out_index_buf : tvm.te.schedule.Buffer - 2-D with shape [batch, num_bbox]. Indices of data in sorted order. - - Returns - ------- - stmt : Stmt - The result IR statement. 
- """ - batch, num_bbox = get_const_tuple(data_buf.shape) - ib = tvm.tir.ir_builder.create() - p_data = ib.buffer_ptr(data_buf) - index_out = ib.buffer_ptr(out_index_buf) - temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local") - temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local") - idxm = tvm.tir.indexmod - with ib.for_range(0, batch, kind="unroll") as b: - start = b * num_bbox - for i in range(2): - with ib.for_range(0, (num_bbox + 1) // 2) as tid: - bbox_id = tid * 2 + i - with ib.if_scope(bbox_id < num_bbox): - index_out[start + bbox_id] = bbox_id - with ib.for_range(0, num_bbox) as k: - with ib.for_range(0, (num_bbox + 1) // 2) as tid: - offset = start + 2 * tid + idxm(k, 2) - with ib.if_scope( - tvm.tir.all(offset + 1 < num_bbox, p_data[offset] < p_data[offset + 1]) - ): - temp_data[0] = p_data[offset] - p_data[offset] = p_data[offset + 1] - p_data[offset + 1] = temp_data[0] - temp_index[0] = index_out[offset] - index_out[offset] = index_out[offset + 1] - index_out[offset + 1] = temp_index[0] - return ib.get() - - -def nms_ir(sorted_bbox_buf, out_buf, nms_threshold): - """Non-maximum suppression. - - Parameters - ---------- - sorted_bbox_buf : tvm.te.schedule.Buffer - 3-D with shape [batch, num_bbox, 5]. The last dimension is in format of - [w_start, h_start, w_end, h_end, score]. - - out_buf : tvm.te.schedule.Buffer - 2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed. - - nms_threshold : float - Non-maximum suppression threshold. - - Returns - ------- - stmt : Stmt - The result IR statement. 
- """ - - def calculate_overlap(out_tensor, box_a_idx, box_b_idx): - """Calculate overlap of two boxes.""" - w = tvm.te.max( - 0.0, - tvm.te.min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) - - tvm.te.max(out_tensor[box_a_idx], out_tensor[box_b_idx]) - + 1.0, - ) - h = tvm.te.max( - 0.0, - tvm.te.min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) - - tvm.te.max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1]) - + 1.0, - ) - i = w * h - u = ( - (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx] + 1.0) - * (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1] + 1.0) - + (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx] + 1.0) - * (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1] + 1.0) - - i - ) - return i / u - - batch, num_bbox = get_const_tuple(out_buf.shape) - ib = tvm.tir.ir_builder.create() - p_data = ib.buffer_ptr(sorted_bbox_buf) - p_out = ib.buffer_ptr(out_buf) - with ib.for_range(0, batch, kind="unroll", name="n") as b: - base_idx = b * num_bbox - for i in range(num_bbox): - p_out[base_idx + i] = False - with ib.for_range(0, num_bbox - 1) as l: - with ib.for_range(0, num_bbox) as i: - with ib.if_scope(tvm.tir.all(i < num_bbox, i > l, p_out[base_idx + l] == False)): - iou = calculate_overlap(p_data, (base_idx + l) * 5, (base_idx + i) * 5) - with ib.if_scope(iou > nms_threshold): - p_out[base_idx + i] = True - return ib.get() - - -def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): - """Copy output after applying nms to continuous memory. - - Parameters - ---------- - sorted_bbox_buf : tvm.te.schedule.Buffer - 3-D with shape [batch, num_bbox, 5]. The last dimension is in format of - [w_start, h_start, w_end, h_end, score]. - - remove_mask_buf : tvm.te.schedule.Buffer - 2-D with shape [batch, num_bbox]. Boolean mask of whether a bounding box should be removed. - - out_buf : tvm.te.schedule.Buffer - 2-D with shape [batch * rpn_post_nms_top_n, 5]. 
The last dimension is in format of - [batch_index, w_start, h_start, w_end, h_end]. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - batch, num_bbox, _ = get_const_tuple(sorted_bbox_buf.shape) - rpn_post_nms_top_n = get_const_int(out_buf.shape[0]) // batch - ib = tvm.tir.ir_builder.create() - i = ib.allocate("int32", (batch,), "i", scope="local") - p_sorted_bbox = ib.buffer_ptr(sorted_bbox_buf) - p_remove = ib.buffer_ptr(remove_mask_buf) - p_out = ib.buffer_ptr(out_buf) - - nkeep = ib.allocate("int32", (batch,), "nkeep", scope="local") - - with ib.for_range(0, batch) as b: - nkeep[b] = 0 - i[b] = 0 - - with ib.for_range(0, num_bbox) as j: - with ib.for_range(0, batch) as b: - with ib.if_scope(p_remove[b * num_bbox + j] == False): - nkeep[b] += 1 - with ib.for_range(0, batch) as b: - with ib.if_scope(nkeep[b] > 0): - with ib.for_range( - 0, te.ceil(tvm.tir.const(rpn_post_nms_top_n, "float32") / nkeep[b]).astype("int32") - ): - with ib.for_range(0, num_bbox) as j: - offset_j = (b * num_bbox + j) * 5 - offset_i = (b * rpn_post_nms_top_n + i[b]) * 5 - with ib.if_scope( - tvm.tir.all( - i[b] < rpn_post_nms_top_n, p_remove[(b * num_bbox + j)] == False - ) - ): - p_out[offset_i] = tvm.tir.Cast("float32", b) - with ib.for_range(0, 4, kind="unroll") as k: - p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k] - i[b] = i[b] + 1 - - body = ib.get() - return body - - -def proposal( - cls_prob, - bbox_pred, - im_info, - scales, - ratios, - feature_stride, - threshold, - rpn_pre_nms_top_n, - rpn_post_nms_top_n, - rpn_min_size, - iou_loss, -): - """Proposal operator. - - Parameters - ---------- - cls_prob : tvm.te.Tensor - 4-D with shape [batch, 2 * num_anchors, height, width] - - bbox_pred : tvm.te.Tensor - 4-D with shape [batch, 4 * num_anchors, height, width] - - im_info : tvm.te.Tensor - 2-D with shape [batch, 3] - - scales : list/tuple of float - Scales of anchor windows. - - ratios : list/tuple of float - Ratios of anchor windows. 
- - feature_stride : int - The size of the receptive field each unit in the convolution layer of the rpn, for example - the product of all stride's prior to this layer. - - threshold : float - Non-maximum suppression threshold. - - rpn_pre_nms_top_n : int - Number of top scoring boxes to apply NMS. -1 to use all boxes. - - rpn_post_nms_top_n : int - Number of top scoring boxes to keep after applying NMS to RPN proposals. - - rpn_min_size : int - Minimum height or width in proposal. - - iou_loss : bool - Usage of IoU loss. - - Returns - ------- - out : tvm.te.Tensor - 2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of - [batch_index, w_start, h_start, w_end, h_end]. - """ - # pylint: disable=unused-argument - batch, _, height, width = get_const_tuple(cls_prob.shape) - num_anchors = len(scales) * len(ratios) - num_bbox = height * width * num_anchors - rpn_pre_nms_top_n = min(rpn_pre_nms_top_n, num_bbox) if rpn_pre_nms_top_n > 0 else num_bbox - - bbox = te.extern( - (batch, num_bbox, 5), - [cls_prob, bbox_pred, im_info], - lambda ins, outs: predict_bbox_ir( - ins[0], ins[1], ins[2], outs[0], scales, ratios, feature_stride, rpn_min_size, iou_loss - ), - dtype=bbox_pred.dtype, - ) - score = te.compute((batch, num_bbox), lambda b, i: bbox[b, i, 4], tag="bbox_score") - valid_count_shape = (1,) - valid_count = te.compute(valid_count_shape, lambda i: num_bbox) - sorted_index = argsort(score, valid_count=valid_count, axis=1, is_ascend=False) - sorted_bbox = te.compute( - (batch, rpn_pre_nms_top_n, 5), - lambda b, i, j: bbox[b, sorted_index[b, i], j], - tag="sorted_bbox", - ) - nms_remove_mask = te.extern( - (batch, rpn_pre_nms_top_n), - [sorted_bbox], - lambda ins, outs: nms_ir(ins[0], outs[0], threshold), - dtype="bool", - ) - nms_out = te.extern( - (batch * rpn_post_nms_top_n, 5), - [sorted_bbox, nms_remove_mask], - lambda ins, outs: prepare_output_ir(ins[0], ins[1], outs[0]), - dtype=sorted_bbox.dtype, - ) - return nms_out diff --git 
a/python/tvm/topi/vision/rcnn/roi_align.py b/python/tvm/topi/vision/rcnn/roi_align.py deleted file mode 100644 index 238e02964356..000000000000 --- a/python/tvm/topi/vision/rcnn/roi_align.py +++ /dev/null @@ -1,228 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=invalid-name -"""Roi align operator""" -import tvm -from tvm import te -from ...utils import get_const_tuple -from ...cpp.utils import bilinear_sample_nchw, bilinear_sample_nhwc - - -def _sample_common( - i, - c, - ph, - pw, - rois, - pooled_size_h, - pooled_size_w, - spatial_scale, - sample_ratio, - dtype, - avg_mode, - bilinear_func, -): - roi = rois[i] - batch_index = roi[0].astype("int32") - roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4] - roi_start_h *= spatial_scale - roi_end_h *= spatial_scale - roi_start_w *= spatial_scale - roi_end_w *= spatial_scale - - # force malformed ROIs to be 1x1 - roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype)) - roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype)) - - bin_h = roi_h / pooled_size_h - bin_w = roi_w / pooled_size_w - - if sample_ratio > 0: - roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32") - else: - roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32") - roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32") - - count = roi_bin_grid_h * roi_bin_grid_w - rh = te.reduce_axis((0, roi_bin_grid_h), name="rh") - rw = te.reduce_axis((0, roi_bin_grid_w), name="rw") - roi_start_h += ph * bin_h - roi_start_w += pw * bin_w - - if avg_mode: - return te.sum( - bilinear_func( - batch_index, - c, - roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h, - roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w, - ) - / count, - axis=[rh, rw], - ) - # max mode - return te.max( - bilinear_func( - batch_index, - c, - roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h, - roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w, - ), - axis=[rh, rw], - ) - - -def roi_align_nchw(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1): - """ROI align operator in NCHW layout. 
- - Parameters - ---------- - data : tvm.te.Tensor - 4-D with shape [batch, channel, height, width] - - rois : tvm.te.Tensor - 2-D with shape [num_roi, 5]. The last dimension should be in format of - [batch_index, w_start, h_start, w_end, h_end] - - pooled_size : int or list/tuple of two ints - output size, or [out_height, out_width] - - spatial_scale : float - Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal - of total stride in convolutional layers, which should be in range (0.0, 1.0] - - mode : int or str - There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and - for the max mode, you can pass b'max' or 1. - - sample_ratio : int - Optional sampling ratio of ROI align, using adaptive size by default. - - Returns - ------- - output : tvm.te.Tensor - 4-D with shape [num_roi, channel, pooled_size, pooled_size] - """ - avg_mode = mode in (b"avg", 0) - max_mode = mode in (b"max", 1) - assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode." 
- dtype = rois.dtype - _, channel, height, width = get_const_tuple(data.shape) - num_roi, _ = get_const_tuple(rois.shape) - - if isinstance(pooled_size, int): - pooled_size_h = pooled_size_w = pooled_size - else: - pooled_size_h, pooled_size_w = pooled_size - - def _bilinear(i, c, y, x): - outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width) - y = tvm.te.min(tvm.te.max(y, 0.0), height - 1) - x = tvm.te.min(tvm.te.max(x, 0.0), width - 1) - val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1) - return tvm.tir.if_then_else(outside, 0.0, val) - - def _sample(i, c, ph, pw): - return _sample_common( - i, - c, - ph, - pw, - rois, - pooled_size_h, - pooled_size_w, - spatial_scale, - sample_ratio, - dtype, - avg_mode, - _bilinear, - ) - - return te.compute( - (num_roi, channel, pooled_size_h, pooled_size_w), _sample, tag="pool,roi_align_nchw" - ) - - -def roi_align_nhwc(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1): - """ROI align operator in NHWC layout. - - Parameters - ---------- - data : tvm.te.Tensor - 4-D with shape [batch, height, width, channel] - - rois : tvm.te.Tensor - 2-D with shape [num_roi, 5]. The last dimension should be in format of - [batch_index, w_start, h_start, w_end, h_end] - - pooled_size : int or list/tuple of two ints - output size, or [out_height, out_width] - - spatial_scale : float - Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal - of total stride in convolutional layers, which should be in range (0.0, 1.0] - - mode : int or str - There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and - for the max mode, you can pass b'max' or 1. - - sample_ratio : int - Optional sampling ratio of ROI align, using adaptive size by default. 
- - Returns - ------- - output : tvm.te.Tensor - 4-D with shape [num_roi, pooled_size, pooled_size, channel] - """ - avg_mode = mode in (b"avg", 0) - max_mode = mode in (b"max", 1) - assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode." - dtype = rois.dtype - _, height, width, channel = get_const_tuple(data.shape) - num_roi, _ = get_const_tuple(rois.shape) - - if isinstance(pooled_size, int): - pooled_size_h = pooled_size_w = pooled_size - else: - pooled_size_h, pooled_size_w = pooled_size - - def _bilinear(i, c, y, x): - outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width) - y = tvm.te.min(tvm.te.max(y, 0.0), height - 1) - x = tvm.te.min(tvm.te.max(x, 0.0), width - 1) - val = bilinear_sample_nhwc(data, (i, y, x, c), height - 1, width - 1) - return tvm.tir.if_then_else(outside, 0.0, val) - - def _sample(i, ph, pw, c): - return _sample_common( - i, - c, - ph, - pw, - rois, - pooled_size_h, - pooled_size_w, - spatial_scale, - sample_ratio, - dtype, - avg_mode, - _bilinear, - ) - - return te.compute( - (num_roi, pooled_size_h, pooled_size_w, channel), _sample, tag="pool,roi_align_nchw" - ) diff --git a/python/tvm/topi/vision/rcnn/roi_pool.py b/python/tvm/topi/vision/rcnn/roi_pool.py deleted file mode 100644 index dd1429bcb3c5..000000000000 --- a/python/tvm/topi/vision/rcnn/roi_pool.py +++ /dev/null @@ -1,95 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -"""ROI pool operator""" -import tvm -from tvm import te -from ...utils import get_const_tuple - - -def roi_pool_nchw(data, rois, pooled_size, spatial_scale): - """ROI pool operator in NCHW layout. - - Parameters - ---------- - data : tvm.te.Tensor - 4-D with shape [batch, channel, height, width] - - rois : tvm.te.Tensor - 2-D with shape [num_roi, 5]. The last dimension should be in format of - [batch_index, w_start, h_start, w_end, h_end] - - pooled_size : int or list/tuple of two ints - output size, or [out_height, out_width] - - spatial_scale : float - Ratio of input feature map height (or w) to raw image height (or w). 
Equals the reciprocal - of total stride in convolutional layers, which should be in range (0.0, 1.0] - - Returns - ------- - output : tvm.te.Tensor - 4-D with shape [num_roi, channel, pooled_size, pooled_size] - """ - dtype = rois.dtype - _, channel, height, width = get_const_tuple(data.shape) - num_roi, _ = get_const_tuple(rois.shape) - - if isinstance(pooled_size, int): - pooled_size_h = pooled_size_w = pooled_size - else: - pooled_size_h, pooled_size_w = pooled_size - - def _pool(i, c, ph, pw): - roi = rois[i] - batch_index = roi[0].astype("int32") - roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4] - - roi_start_h = te.round(roi_start_h * spatial_scale).astype("int32") - roi_start_w = te.round(roi_start_w * spatial_scale).astype("int32") - roi_end_h = te.round(roi_end_h * spatial_scale).astype("int32") - roi_end_w = te.round(roi_end_w * spatial_scale).astype("int32") - - # force malformed ROIs to be 1x1 - roi_h = tvm.te.max(roi_end_h - roi_start_h + 1, tvm.tir.const(1, "int32")) - roi_w = tvm.te.max(roi_end_w - roi_start_w + 1, tvm.tir.const(1, "int32")) - - bin_h = roi_h.astype(dtype) / pooled_size_h - bin_w = roi_w.astype(dtype) / pooled_size_w - - # use epsilon to prevent floating point precision loss in floor/ceil - epsilon = tvm.tir.const(0.00001, dtype) - hstart = te.floor(ph * bin_h + epsilon).astype("int32") - wstart = te.floor(pw * bin_w + epsilon).astype("int32") - hend = te.ceil((ph + 1) * bin_h - epsilon).astype("int32") - wend = te.ceil((pw + 1) * bin_w - epsilon).astype("int32") - hstart = tvm.te.min(tvm.te.max(hstart + roi_start_h, 0), height) - wstart = tvm.te.min(tvm.te.max(wstart + roi_start_w, 0), width) - hend = tvm.te.min(tvm.te.max(hend + roi_start_h, 0), height) - wend = tvm.te.min(tvm.te.max(wend + roi_start_w, 0), width) - - non_empty = tvm.tir.all(hstart < hend, wstart < wend) - min_value = lambda dtype: tvm.tir.if_then_else( - non_empty, tvm.te.min_value(dtype), tvm.tir.const(0.0, dtype) - ) - # pylint: 
disable=unnecessary-lambda - _max = te.comm_reducer(lambda x, y: tvm.te.max(x, y), min_value, name="max") - rh = te.reduce_axis((0, hend - hstart), "rh") - rw = te.reduce_axis((0, wend - wstart), "rw") - return _max(data[batch_index, c, hstart + rh, wstart + rw], axis=[rh, rw]) - - return te.compute((num_roi, channel, pooled_size_h, pooled_size_w), _pool, tag="pool,roi_pool") diff --git a/python/tvm/topi/vision/reorg.py b/python/tvm/topi/vision/reorg.py deleted file mode 100644 index 9883085f9f40..000000000000 --- a/python/tvm/topi/vision/reorg.py +++ /dev/null @@ -1,42 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -REORG Operator -==================== -Reorg operator, used in darknet. -""" -from __future__ import absolute_import as _abs -from .. import cpp - - -def reorg(data, stride): - """Reorg forward operators. 
- - Parameters - ---------- - Input : tvm.te.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - stride : int - Stride value for reorganization - - Returns - ------- - Output : tvm.te.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - return cpp.vision.reorg(data, stride) diff --git a/python/tvm/topi/vision/ssd/__init__.py b/python/tvm/topi/vision/ssd/__init__.py deleted file mode 100644 index 1ac388da9a1e..000000000000 --- a/python/tvm/topi/vision/ssd/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=wildcard-import -"""VISION network operators""" -from __future__ import absolute_import as _abs - -from .multibox import * diff --git a/python/tvm/topi/vision/ssd/multibox.py b/python/tvm/topi/vision/ssd/multibox.py deleted file mode 100644 index 234bfd795328..000000000000 --- a/python/tvm/topi/vision/ssd/multibox.py +++ /dev/null @@ -1,369 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable -"""SSD multibox operators""" -import tvm - -from tvm.te import hybrid -from tvm.tir import exp, sqrt - -from tvm import topi - -from ..nms import non_max_suppression - - -@hybrid.script -def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): - """Hybrid routing for multibox_prior operator. - - Parameters - ---------- - data : tvm.te.Tensor or numpy NDArray - 4-D tensor with shape [batch, channel, height, width]] - - sizes : tvm ConsExpr - Sizes for anchor boxes. - - ratios : tvm ConsExpr - Ratios for anchor boxes. - - steps : tvm ConsExpr - Priorbox step across y and x, -1 for auto calculation. - - offsets : tvm ConsExpr - Priorbox center offsets, y and x respectively. 
- - Returns - ------- - output : tvm.te.Tensor or numpy NDArray - 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] - """ - in_height = data.shape[2] - in_width = data.shape[3] - num_sizes = len(sizes) - num_ratios = len(ratios) - num_boxes = in_height * in_width * (num_sizes + num_ratios - 1) - output = output_tensor((1, num_boxes, 4), "float32") - steps_h = steps[0] * 1.0 if steps[0] > 0 else 1.0 / in_height - steps_w = steps[1] * 1.0 if steps[1] > 0 else 1.0 / in_width - offset_h = offsets[0] - offset_w = offsets[1] - - # Need to define var out of const_range + if - w = 0.0 - h = 0.0 - - for i in parallel(in_height): - center_h = (i + offset_h) * steps_h - for j in range(in_width): - center_w = (j + offset_w) * steps_w - for k in const_range(num_sizes + num_ratios - 1): - if k < num_sizes: - w = float32(sizes[k] * in_height) / in_width / 2.0 - h = sizes[k] / 2.0 - else: - w = ( - float32(sizes[0] * in_height) - / in_width - * sqrt(ratios[k - num_sizes + 1] * 1.0) - / 2.0 - ) - h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 - count = ( - i * in_width * (num_sizes + num_ratios - 1) - + j * (num_sizes + num_ratios - 1) - + k - ) - output[0, count, 0] = center_w - w - output[0, count, 1] = center_h - h - output[0, count, 2] = center_w + w - output[0, count, 3] = center_h + h - - return output - - -def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): - """Generate prior(anchor) boxes from data, sizes and ratios. - - Parameters - ---------- - data : tvm.te.Tensor - 4-D with shape [batch, c_in, h_in, w_in]] - - sizes : tuple of float - Tuple of sizes for anchor boxes. - - ratios : tuple of float - Tuple of ratios for anchor boxes. - - steps : Tuple of float - Priorbox step across y and x, -1 for auto calculation. - - offsets : tuple of int - Priorbox center offsets, y and x respectively. - - clip : boolean - Whether to clip out-of-boundary boxes. 
- - Returns - ------- - out : tvm.te.Tensor - 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] - """ - out = hybrid_multibox_prior( - data, - tvm.runtime.convert(sizes), - tvm.runtime.convert(ratios), - tvm.runtime.convert(steps), - tvm.runtime.convert(offsets), - ) - if clip: - out = topi.clip(out, 0, 1) - return out - - -@hybrid.script -def _hybrid_transform_loc(anchor, pred_loc, variance, clip, batch_idx, anchor_idx): - """Transform prior anchor box to output box through location predictions.""" - al = anchor[0, anchor_idx, 0] - at = anchor[0, anchor_idx, 1] - ar = anchor[0, anchor_idx, 2] - ab = anchor[0, anchor_idx, 3] - - px = pred_loc[batch_idx, 0] - py = pred_loc[batch_idx, 1] - pw = pred_loc[batch_idx, 2] - ph = pred_loc[batch_idx, 3] - - vx = variance[0] - vy = variance[1] - vw = variance[2] - vh = variance[3] - - output = output_tensor((4,), pred_loc.dtype) - - aw = ar - al - ah = ab - at - ax = (al + ar) / 2.0 - ay = (at + ab) / 2.0 - ox = px * vx * aw + ax - oy = py * vy * ah + ay - ow = exp(pw * vw) * aw / 2.0 - oh = exp(ph * vh) * ah / 2.0 - output[0] = max(0.0, min(1.0, ox - ow)) if clip else ox - ow - output[1] = max(0.0, min(1.0, oy - oh)) if clip else oy - oh - output[2] = max(0.0, min(1.0, ox + ow)) if clip else ox + ow - output[3] = max(0.0, min(1.0, oy + oh)) if clip else oy + oh - return output - - -@hybrid.script -def hybrid_multibox_transform_loc( - cls_prob, - loc_pred, - anchor, - clip, - threshold, - variances, - keep_background, -): - """Hybrid routing for transform location in multibox_detection operator. - - Parameters - ---------- - cls_prob : tvm.te.Tensor or numpy NDArray - 3-D tensor of class probabilities. - - loc_pred : tvm.te.Tensor or numpy NDArray - 2-D tensor of location regression predictions. - - anchor : tvm.te.Tensor or numpy NDArray - 3-D tensor of prior anchor boxes. - - clip : tvm.tir.const - Whether to clip out-of-boundary boxes. 
- - threshold : tvm.tir.const - Threshold to be a positive prediction. - - variances : tvm.nd.NDArray - Variances to be decoded from box regression output. - - keep_background : tvm.tir.const - Whether to keep boxes detected as background or not. - - Returns - ------- - out_loc : tvm.te.Tensor or numpy NDArray - 3-D tensor of transformed location. - - valid_count : tvm.te.Tensor or numpy NDArray - 1_d tensor of valid counts for boxes. - """ - batch_size = cls_prob.shape[0] - num_classes = cls_prob.shape[1] - num_anchors = cls_prob.shape[2] - pred_coord = allocate( - ( - batch_size, - 4, - ), - loc_pred.dtype, - ) - out_loc = output_tensor((batch_size, num_anchors, 6), loc_pred.dtype) - valid_count = output_tensor((batch_size,), "int32") - - start_cls_idx = 0 if keep_background else 1 - - for i in parallel(batch_size): - valid_count[i] = 0 - for j in range(num_anchors): - # Find the predicted class id and probability - score = -1.0 - cls_id = 0 - for k in range(start_cls_idx, num_classes): - temp = cls_prob[i, k, j] - cls_id = k if temp > score else cls_id - score = max(temp, score) - if cls_id > 0 and score < threshold: - cls_id = 0 - # [id, prob, xmin, ymin, xmax, ymax] - # Remove background if 'keep_background=False', restore original id - if keep_background or cls_id > 0: - out_loc[i, valid_count[i], 0] = cls_id - 0.0 if keep_background else cls_id - 1.0 - out_loc[i, valid_count[i], 1] = score - for l in range(4): - pred_coord[i, l] = loc_pred[i, j * 4 + l] - out_coord = _hybrid_transform_loc(anchor, pred_coord, variances, clip, i, j) - out_loc[i, valid_count[i], 2] = out_coord[0] - out_loc[i, valid_count[i], 3] = out_coord[1] - out_loc[i, valid_count[i], 4] = out_coord[2] - out_loc[i, valid_count[i], 5] = out_coord[3] - valid_count[i] += 1 - - return out_loc, valid_count - - -def multibox_transform_loc( - cls_prob, - loc_pred, - anchor, - clip=True, - threshold=0.01, - variances=(0.1, 0.1, 0.2, 0.2), - keep_background=False, -): - """Location transformation for 
multibox detection - - Parameters - ---------- - cls_prob : tvm.te.Tensor - Class probabilities. - - loc_pred : tvm.te.Tensor - Location regression predictions. - - anchor : tvm.te.Tensor - Prior anchor boxes. - - clip : boolean - Whether to clip out-of-boundary boxes. - - threshold : float - Threshold to be a positive prediction. - - variances : tuple of float - Variances to be decoded from box regression output. - - keep_background : boolean - Whether to keep boxes detected as background or not. - - Returns - ------- - ret : tuple of tvm.te.Tensor - """ - - return hybrid_multibox_transform_loc( - cls_prob, - loc_pred, - anchor, - tvm.tir.const(clip, "bool"), - tvm.tir.const(threshold, "float32"), - tvm.runtime.convert(variances), - tvm.tir.const(keep_background, "bool"), - ) - - -def multibox_detection( - cls_prob, - loc_pred, - anchor, - clip=True, - threshold=0.01, - nms_threshold=0.5, - force_suppress=False, - variances=(0.1, 0.1, 0.2, 0.2), - nms_topk=-1, -): - """Convert multibox detection predictions. - - Parameters - ---------- - cls_prob : tvm.te.Tensor - Class probabilities. - - loc_pred : tvm.te.Tensor - Location regression predictions. - - anchor : tvm.te.Tensor - Prior anchor boxes. - - clip : boolean - Whether to clip out-of-boundary boxes. - - nms_threshold : float - Non-maximum suppression threshold. - - force_suppress : boolean - Whether to suppress all detections regardless of class_id. - - threshold : float - Threshold to be a positive prediction. - - variances : tuple of float - Variances to be decoded from box regression output. - - nms_topk : int - Keep maximum top k detections before nms, -1 for no limit. 
- - Returns - ------- - out : tvm.te.Tensor - 3-D tensor with shape (batch_size, num_anchors, 6) - """ - inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = non_max_suppression( - inter_out[0], - inter_out[1], - inter_out[1], - max_output_size=-1, - iou_threshold=nms_threshold, - force_suppress=force_suppress, - top_k=nms_topk, - return_indices=False, - ) - return out diff --git a/python/tvm/utils/__init__.py b/python/tvm/utils/__init__.py deleted file mode 100644 index 33abc352b0f0..000000000000 --- a/python/tvm/utils/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Utilities operating at a graph/model or other "high" level""" - -from .roofline import roofline_analysis diff --git a/python/tvm/utils/roofline/__init__.py b/python/tvm/utils/roofline/__init__.py deleted file mode 100644 index 45cc880c5b85..000000000000 --- a/python/tvm/utils/roofline/__init__.py +++ /dev/null @@ -1,279 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Utilities for computing an approximate roofline model""" -from typing import Dict, Optional, Union - -import numpy as np - -from ... import IRModule, auto_scheduler, build, get_global_func, nd, relay, tir, topi, transform -from ...contrib import utils -from ...ir.expr import GlobalVar -from ...ir.instrument import pass_instrument -from ...rpc.base import RPC_SESS_MASK -from ...rpc.client import RPCSession -from ...runtime import Device, num_threads, profiler_vm, profiling -from ...script import tir as T -from ...target import Target -from . import cuda, registry, x86 - - -def _create_args(mod: IRModule, dev: Device, func_name: str = "main", remote=None): - if dev.device_type >= RPC_SESS_MASK: - random_fill = remote.get_function("tvm.contrib.random.random_fill") - else: - random_fill = get_global_func("tvm.contrib.random.random_fill") - assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake" - args = [] - for arg in mod[func_name].params: - ary = nd.empty( - [x.value for x in arg.type_annotation.shape], - arg.type_annotation.dtype, - device=dev, - ) - random_fill(ary) - args.append(ary) - return args - - -@pass_instrument -class SaveLoweredTIR: - """Save TIR functions for analysis. - - We need the TIR function in a form that can be handled by - `auto_scheduler.feature.named_features_from_primfunc`, but which - is the closest to the final lowered form as possible. 
Right now this - means right before tir.SplitHostDevice. - - """ - - def __init__(self, before_pass: str = "tir.SplitHostDevice"): - """ - Parameters - ---------- - before_pass: str - Pass before which the TIR is saved. - """ - self.functions = {} - self.before_pass = before_pass - - def run_before_pass(self, mod, info): - if info.name == self.before_pass: - for v, func in mod.functions.items(): - if isinstance(func, tir.PrimFunc): - self.functions[v] = func - - -def roofline_from_existing( - report: profiling.Report, - tir_functions: Dict[GlobalVar, tir.PrimFunc], - target: Target, - dev: Device, - remote: Optional[RPCSession] = None, -) -> profiling.Report: - """Add roofline and other estimated statistics to an existing profiling report. - - :py:func:`roofline_analysis` should always be used instead of this function - unless you need a custom compilation pipeline. - - Calculating roofline statistics requires features extracted the TIR - functions in addition to per-operator runtime information (`report`) of the - same TIR features. The features and TIR functions are not included with the - compiled library used to generate the per-operator runtime. It is essential - that the per-operator information comes from the exact same compilation - pipeline as the TIR functions. - - - Example - ------- - - ..code: : python - - import tvm - import tvm.relay - - mod, params = tvm.relay.testing.mlp.get_workload() - - # it is recommended to use SaveLoweredTIR to get out the tir primfuncs - save_tir = tvm.utils.roofline.SaveLoweredTIR() - with tvm.transform.PassContext(opt_level=3, pass_instrument=[save_tir]): - lib = relay.vm.compile(mod, params=params, target=target) - - vmexec = profiler_vm.VirtualMachineProfiler(lib, dev) - report = vmexec.profile(*inputs) - - roofline_report = roofline_from_existing(report, save_tir.functions, target, dev) - - - Parameters - ---------- - report : Report - Existing profiling report from :py:method:`VirtualMachineProfiler.profile`. 
- tir_functions : Dict[GlobalVar, PrimFunc] - TIR primfuncs from the module run to generate `report`. It is nessesary - that these functions come before the `tir.MakePackedAPI` pass and are - compatible with auto_scheduler featurization. - :py:class:`SaveLoweredTIR` is the recommended way to collect these - functions. - target : Target - TVM target that `report` was generated with. - dev : Device - Device that `report` was generated with. - remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. - - Returns - ------- - profiling.Report - New profiling report that includes all information from `report` - along with additional roofline metrics. See - :py:func:`roofline_analysis` for more information on which metrics - are included. - """ - - all_features = { - prim.attrs["hash"]: (name, prim, auto_scheduler.feature.named_features_from_primfunc(prim)) - for name, prim in tir_functions.items() - if isinstance(prim, tir.PrimFunc) and "hash" in prim.attrs.keys() - } - - new_configuration = dict(report.configuration.items()) - new_calls = [] - for call in report.calls: - if "Hash" in call.keys() and call["Hash"] in all_features: - _, prim, features = all_features[call["Hash"]] - if features is None: - continue - - with target: - flops, peak_flops, flops_name = registry.estimate_peak_flops( - prim, features, target, dev, remote - ) - loaded_bytes, peak_bandwidth, bandwidth_name = registry.estimate_peak_bandwidth( - prim, features, target, dev, remote - ) - new_configuration[f"Estimated Peak FLOP/s ({flops_name})"] = profiling.Ratio(peak_flops) - new_configuration[ - f"Estimated Peak Bandwidth ({bandwidth_name}, byte/second)" - ] = profiling.Ratio(peak_bandwidth) - ridge_point = peak_flops / peak_bandwidth - - runtime = call["Duration (us)"].microseconds * 1e-6 - arith_inten = flops / loaded_bytes - call = dict(call) - call["Loaded Bytes"] = profiling.Count(int(loaded_bytes)) - 
call["Estimated FLOPs"] = profiling.Count(int(flops)) - call["Arithmetic Intensity"] = profiling.Ratio(arith_inten) - call["FLOP/s"] = profiling.Ratio(flops / runtime) - call["Bandwidth"] = profiling.Ratio(loaded_bytes / runtime) - compute_bound = arith_inten > ridge_point - call["Bound"] = "compute" if compute_bound else "memory" - per_mem_bound = (loaded_bytes / runtime) / peak_bandwidth * 100 - per_compute_bound = (flops / runtime) / peak_flops * 100.0 - # We use ratio here because the percentages should be averaged instead of summed. - call["Percent of Theoretical Optimal"] = profiling.Ratio( - per_compute_bound if compute_bound else per_mem_bound - ) - new_calls.append(call) - else: - new_calls.append(call) - return profiling.Report(new_calls, report.device_metrics, new_configuration) - - -def roofline_analysis( - mod: IRModule, - params: Dict[str, nd.NDArray], - target: Union[str, Target], - dev: Device, - remote: Optional[RPCSession] = None, -) -> profiling.Report: - """ - Create a profiling report that contains roofline and other estimated - statistics from running a module on the VM. - - The roofline model measures how close a operator gets to best possible - memory bandwidth or FLOP/s depending on whether it is memory or compute - bound. This computation uses the runtime of the operator along with two - numbers extracted from the TIR code: bytes of memory touched and number of - floating point operations. - - These statistics are calculated by analyzing the lowered TIR of each - operator, so they are estimates of the true values. The statistics are: - - Bound: Is the operator memory or compute bound. This is computed by - assuming that the operator could perfectly cache all loads -- each byte - of memory is only loaded once. - - Percent of Theoretical Optimal: What percent of theoretical optimal for - the bound. i.e. percent of peak memory bandwidth if memory bound, - percent of peak FLOP/s if compute bound. 
- - Loaded Bytes: estimation of the number of bytes loaded from main memory. - - Estimated Flops: estimated number of floating point operations. - - Arithmetic Intensity: ratio of FLOPs per byte of data. - - FLOP/s: floating point operations per second. - - Bandwidth: Number of bytes loaded per second. - - Parameters - ---------- - mod : IRModule - Uncompiled input module - - params : Dict[str, nd.NDArray] - - target : Union[str, Target] - Target to run on. - - dev : Device - Device to run on. - - remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. - - Returns - ------- - - report : profiling.Report - Profiling report which includes the estimated statistics. - """ - if isinstance(target, str): - target = Target(target) - - save_tir = SaveLoweredTIR() - # copy existing context but add our instrument - pass_ctx = transform.PassContext.current() - with transform.PassContext( - opt_level=pass_ctx.opt_level, - required_pass=pass_ctx.required_pass, - disabled_pass=pass_ctx.disabled_pass, - instruments=list(pass_ctx.instruments) + [save_tir], - config=pass_ctx.config, - ): - lib = relay.vm.compile(mod, params=params, target=target) - # upload to remote if running over rpc - if dev.device_type >= RPC_SESS_MASK: - if remote is None: - raise RuntimeError("A RPCSession must be provided when using a remote device.") - temp = utils.tempdir() - path = temp.relpath("roofline_lib.tar") - lib.mod.export_library(path) - remote.upload(path) - lib = remote.load_module("roofline_lib.tar") - vmexec = profiler_vm.VirtualMachineProfiler(lib, dev) - - args = _create_args(mod, dev, remote=remote) - report = vmexec.profile(*args) - - return roofline_from_existing(report, save_tir.functions, target, dev, remote=remote) diff --git a/python/tvm/utils/roofline/cuda.py b/python/tvm/utils/roofline/cuda.py deleted file mode 100644 index b83a902b7fda..000000000000 --- a/python/tvm/utils/roofline/cuda.py +++ 
/dev/null @@ -1,407 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Estimation of peak flops and memory bandwidth for cuda devices""" -import functools -import re -from typing import Dict, Optional, Tuple - -import numpy as np - -from ... import build, nd, transform -from ...contrib import nvcc, utils -from ...rpc.base import RPC_SESS_MASK -from ...rpc.client import RPCSession -from ...runtime import Device -from ...script import tir as T -from ...target import Target -from ...tir import PrimFunc -from . import registry - - -@functools.lru_cache(maxsize=None) -def estimate_peak_flops_tensorcore( - target: Target, - dev: Device, - remote: Optional[RPCSession], - mat_dtype: str = "float16", - acc_dtype: str = "float32", -) -> Tuple[float, float, str]: - """Estimate the peak FLOP/s of a cuda device with tensorcores. - - This estimate should only be used to compare with operators that can use - dense tensorcore mma instructions. 
- - References - ---------- - Wei Sun, Ang Li, Tong Geng, Sander Stuijk, Henk Corporaal: "Dissecting - Tensor Cores via Microbenchmarks: Latency, Throughput and Numerical - Behaviors", 2022; http://arxiv.org/abs/2206.02874 - https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.1.pdf - - Parameters - ---------- - target : Target - Target to run on. This should be as specific to the actual hardware as - possible. - dev : Device - Device to run on. - remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. - mat_dtype : str - Dtype of matrices passed to mma instructions. - acc_dtype : str - Dtype of accumulator to use with mma instructions. Should be compatible - with `mat_dtype`. - - Returns - ------- - peak_flops : float - Approximate sustained FLOP/s of this target/device combo assuming - mma instructions. Addition and multiplications are each counted as - separate FLOPs. 
- """ - - @T.prim_func - def peak_flops_tensorcore_tir( - inp: T.Buffer((16, 16), mat_dtype), - out: T.Buffer((16, 16), acc_dtype), - n: T.int32, - sms: T.int32, - ): - # pylint: disable=invalid-name, missing-function-docstring - A = T.alloc_buffer((16, 16), dtype=mat_dtype, scope="wmma.matrix_a") - B = T.alloc_buffer((16, 16), dtype=mat_dtype, scope="wmma.matrix_b") - C = T.alloc_buffer((16, 16), dtype=acc_dtype, scope="wmma.accumulator") - for _ in T.thread_binding(sms, thread="blockIdx.x"): - for _ in T.thread_binding( - 8, thread="threadIdx.y" - ): # need 8 warps to get enough in-SM parallelism - for _ in T.thread_binding(32, thread="threadIdx.x"): - T.evaluate( - T.tvm_load_matrix_sync( - A.data, - 16, - 16, - 16, - 0, - T.tvm_access_ptr( - T.type_annotation(dtype=mat_dtype), - inp.data, - 0, - 16, - 1, - dtype="handle", - ), - 16, - "row_major", - dtype="handle", - ) - ) - T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, 0, dtype="handle")) - T.evaluate(T.tvm_fill_fragment(C.data, 16, 16, 16, 0, 0, dtype="handle")) - for _ in range(n): - T.evaluate( - T.tvm_mma_sync( - C.data, 0, A.data, 0, B.data, 0, C.data, 0, dtype="handle" - ) - ) - T.evaluate( - T.tvm_store_matrix_sync( - C.data, - 16, - 16, - 16, - 0, - T.tvm_access_ptr( - T.type_annotation(dtype=acc_dtype), - out.data, - 0, - 16, - 2, - dtype="handle", - ), - 16, - "row_major", - dtype="handle", - ) - ) - - n = 100000 - sms = dev.multi_processor_count - specialized = peak_flops_tensorcore_tir.specialize( - {peak_flops_tensorcore_tir.params[2]: n, peak_flops_tensorcore_tir.params[3]: sms} - ) - with transform.PassContext(opt_level=3): - f = build(specialized, target=target) - - # upload to remote if running over rpc - if dev.device_type >= RPC_SESS_MASK: - if remote is None: - raise RuntimeError("A RPCSession must be provided when using a remote device.") - temp = utils.tempdir() - path = temp.relpath("peak_mma_flops.tar") - f.export_library(path) - remote.upload(path) - f = 
remote.load_module("peak_mma_flops.tar") - - x = nd.empty((16, 16), dtype=mat_dtype, device=dev) - y = nd.empty((16, 16), dtype=acc_dtype, device=dev) - times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(x, y) - # each mma operation computes 16 x 16 x 16 FLOPs - return n * 16 * 16 * 16 * 2 * sms * 8 / times.min - - -@functools.lru_cache(maxsize=None) -def estimate_peak_flops_fma( - target: Target, - dev: Device, - remote: Optional[RPCSession], - dtype: str, -) -> Tuple[float, float, str]: - """Estimate the peak FLOP/s of a cuda device with fma operations (not using tensor cores). - - References - ---------- - https://www.nvidia.com/content/PDF/nvidia-ampere-ga-102-gpu-architecture-whitepaper-v2.1.pdf - - Parameters - ---------- - target : Target - Target to run on. This should be as specific to the actual hardware as - possible. - dev : Device - Device to run on. - remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. - dtype : str - Dtype of fma operation - - Returns - ------- - peak_flops : float - Approximate sustained FLOP/s of this target/device combo assuming - fma instructions. Addition and multiplications are each counted as - separate FLOPs. 
- """ - - vec_width = 32 - warps = 16 # need 16 warps to get enough in-SM parallelism - sms = dev.multi_processor_count - n = 100000 - - @T.prim_func - def peak_flops_fma_tir( - A: T.Buffer((sms, warps, vec_width), dtype), - B: T.Buffer((sms, warps, vec_width), dtype), - ): - # pylint: disable=invalid-name, missing-function-docstring - shared = T.alloc_buffer((sms, warps, vec_width), dtype=dtype, scope="shared") - for sm in T.thread_binding(sms, thread="blockIdx.x"): - for warp in T.thread_binding(warps, thread="threadIdx.y"): - for t in T.thread_binding(vec_width, thread="threadIdx.x"): - shared[sm, warp, t] = A[sm, warp, t] - for _ in range(n): - shared[sm, warp, t] = ( - shared[sm, warp, t] * shared[sm, warp, t] + shared[sm, warp, t] - ) - B[sm, warp, t] = shared[sm, warp, t] - - with transform.PassContext(opt_level=3): - f = build(peak_flops_fma_tir, target=target) - - # upload to remote if running over rpc - if dev.device_type >= RPC_SESS_MASK: - if remote is None: - raise RuntimeError("A RPCSession must be provided when using a remote device.") - temp = utils.tempdir() - path = temp.relpath("peak_fma_flops.tar") - f.export_library(path) - remote.upload(path) - f = remote.load_module("peak_fma_flops.tar") - - x = nd.empty((sms, warps, vec_width), dtype=dtype, device=dev) - y = nd.empty((sms, warps, vec_width), dtype=dtype, device=dev) - times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(x, y) - return n * warps * sms * vec_width * 2 / times.min - - -@registry.estimate_peak_flops.register("cuda") -def estimate_peak_flops( - func: PrimFunc, # pylint: disable=unused-argument - features: Dict[str, np.ndarray], - target: Target, - dev: Device, - remote: Optional[RPCSession], -) -> Tuple[float, float, str]: - """Estimate the peak FLOP/s of a cuda device. - - Parameters - ---------- - func : PrimFunc - Function to estimate peak flops for. Used to check if a specific kind - intrinsic or dtype could be used with this function. 
- features : Dict[str, np.ndarry] - Features extracted from `func`. Used to check if a specific kind - intrinsic or dtype could be used with this function. - target : Target - Target to run on. This should be as specific to the actual hardware as - possible. - dev : Device - Device to run on. - remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. - - Returns - ------- - flops : float - Estimated number of flops used by `func`. - peak_flops : float - Approximate sustained FLOP/s of this target/device combo. Addition and - multiplications are each counted as separate FLOPs. - name : str - Dtype/intrinsic used by `func` to achieve peak flops. - """ - has_tensorcore = nvcc.have_tensorcore(dev.compute_version) - # assume that the first argument dtype is the same as all the others - dtype = list(func.buffer_map.values())[0].dtype - if dtype == "float16" and has_tensorcore: - peak_flops = estimate_peak_flops_tensorcore(target, dev, remote) - name = "float16 tensorcore" - else: - peak_flops = estimate_peak_flops_fma(target, dev, remote, dtype) - name = f"{dtype} fma" - flops = np.sum( - features["float_addsub"] - + features["float_mul"] - + features["float_mad"] * 2 - + features["float_divmod"] - ) - return flops, peak_flops, name - - -@T.prim_func -def peak_bandwidth_tir(a: T.handle, b: T.handle, blocks: T.int32, warp_size: T.int32) -> None: - # pylint: disable=invalid-name, missing-function-docstring - N = T.int32() - A = T.match_buffer(a, [blocks, N, 4, warp_size], "float32") - B = T.match_buffer(b, [blocks, 4, warp_size], "float32") - for i in T.thread_binding(blocks, "blockIdx.x"): - for k in T.serial(N): - for l in T.unroll(4): - # vectorized load is necessary to hit peak bandwidth - for j in T.thread_binding(warp_size, "threadIdx.x"): - # += is necessary to introduce a data dependency for all - # elements of A, preventing the backend from removing the - # `k` loop and setting 
`k` to the loop extent. - B[i, l, j] += A[i, k, l, j] - - -@functools.lru_cache(maxsize=None) -def estimate_peak_bandwidth_global_mem( - target: Target, - dev: Device, - remote: Optional[RPCSession] = None, -) -> Tuple[float, float, str]: - """Estimate peak bandwidth of global memory. See estimate_peak_bandwidth""" - warp_size = dev.warp_size - # These sizes seem large enough to give the card time to hit a fixpoint on memory bandwidth - blocks = 1024 - size = 1024 - - specialized = peak_bandwidth_tir.specialize( - {peak_bandwidth_tir.params[2]: blocks, peak_bandwidth_tir.params[3]: warp_size} - ) - with transform.PassContext(opt_level=3): - f = build(specialized, target=target) - - # upload to remote if running over rpc - if dev.device_type >= RPC_SESS_MASK: - if remote is None: - raise RuntimeError("A RPCSession must be provided when using a remote device.") - temp = utils.tempdir() - path = temp.relpath("peak_bandwidth.tar") - f.export_library(path) - remote.upload(path) - f = remote.load_module("peak_bandwidth.tar") - - a = nd.empty((blocks, size, 4, warp_size), dtype="float32", device=dev) - b = nd.empty((blocks, 4, warp_size), dtype="float32", device=dev) - times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(a, b) - return a.numpy().size * 4 / times.min # 4 bytes per float32 - - -@registry.estimate_peak_bandwidth.register("cuda") -def estimate_peak_bandwidth( - func: PrimFunc, # pylint: disable=unused-argument - features: Dict[str, np.ndarray], - target: Target, - dev: Device, - remote: Optional[RPCSession] = None, -) -> Tuple[float, float, str]: - """Estimate peak memory bandwidth of a target/device combo. - - Peak bandwidth is estimated by running a small experiment on the underlying - hardware. The peak bandwidth measurement assumes that vector instructions - are being used to load the data. - - Parameters - ---------- - func : PrimFunc - Function to estimate peak bandwidth for. 
Used to check if a specific - kind of memory could be used with this function. - features : Dict[str, np.ndarry] - Features extracted from `func`. Used to check if a specific kind of - memory could be used with this function. - target : Target - Target to use for measurement. This target should be as specific to the - underlying hardware as possible. - dev : Device - Device to measure peak bandwidth on. - remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. - - Returns - ------- - loaded_bytes : float - Estimated bytes loaded by `func`. - peak_bandwidth : float - Peak memory bandwidth in bytes/seconds. - name : str - Name of the memory being used. - """ - # autoscheduler features do not take into account that 1. - # global and shared memory have very different performance - # characteristics -- both are included in the same bytes - # touched count 2. multiple threads accessing the same byte - # of memory does not use the same amount of bandwidth as - # multiple threads accessing different bytes of memory. We - # use unique bytes accessed here to avoid these two issues, - # but this does bias results towards being more compute - # bound. - loaded_bytes = sum( - [ - np.sum(x) - for (k, x) in features.items() - if re.match(r"^B[0-9]+\.unique_bytes$", k) is not None - ] - ) - peak_bandwidth = estimate_peak_bandwidth_global_mem(target, dev, remote) - return loaded_bytes, peak_bandwidth, "global" diff --git a/python/tvm/utils/roofline/registry.py b/python/tvm/utils/roofline/registry.py deleted file mode 100644 index 9358529b38ec..000000000000 --- a/python/tvm/utils/roofline/registry.py +++ /dev/null @@ -1,111 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Definition of generic functions for estimating peak flops and bandwidth""" -from typing import Dict, Optional, Tuple - -import numpy as np - -from ...rpc.client import RPCSession -from ...runtime import Device -from ...target import Target, generic_func -from ...tir import PrimFunc - - -@generic_func -def estimate_peak_bandwidth( - func: PrimFunc, - features: Dict[str, np.ndarray], - target: Target, - dev: Device, - remote: Optional[RPCSession] = None, -) -> Tuple[float, float, str]: - """Estimate peak memory bandwidth of a target/device combo. - - Peak bandwidth is estimated by running a small experiment on the underlying - hardware. The peak bandwidth measurement assumes that vector instructions - are being used to load the data. - - Parameters - ---------- - func : PrimFunc - Function to estimate peak bandwidth for. Used to check if a specific - kind of memory could be used with this function. - features : Dict[str, np.ndarry] - Features extracted from `func`. Used to check if a specific kind of - memory could be used with this function. - target : Target - Target to use for measurement. This target should be as specific to the - underlying hardware as possible. - dev : Device - Device to measure peak bandwidth on. - remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. 
- - Returns - ------- - loaded_bytes : float - Estimated bytes loaded by `func`. - peak_bandwidth : float - Peak memory bandwidth in bytes/seconds. - name : str - Name of the memory being used. - """ - raise NotImplementedError() - - -@generic_func -def estimate_peak_flops( - func: PrimFunc, - features: Dict[str, np.ndarray], - target: Target, - dev: Device, - remote: Optional[RPCSession], -) -> Tuple[float, float, str]: - """ - Estimate the maximum number of FLOP/s this target/device combo is capable - of reaching by running a test program. This is a generic function that - should be overridden for each target. - - Parameters - ---------- - func : PrimFunc - Function to estimate peak flops for. Used to check if a specific kind - intrinsic or dtype could be used with this function. - features : Dict[str, np.ndarry] - Features extracted from `func`. Used to check if a specific kind - intrinsic or dtype could be used with this function. - target : Target - Target to run on. This should be as specific to the actual hardware as - possible to make sure that LLVM generates the best vector code. - dev : Device - Device to run on. - remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. - - Returns - ------- - flops : float - Estimated number of flops used by `func`. - peak_flops : float - Approximate sustained FLOP/s of this target/device combo assuming - vectorized FMA instructions. Each FMA operation counts as two FLOPs. - name : str - Dtype/intrinsic used by `func` to achieve peak flops. - """ - raise NotImplementedError() diff --git a/python/tvm/utils/roofline/x86.py b/python/tvm/utils/roofline/x86.py deleted file mode 100644 index 5d2dd27e523b..000000000000 --- a/python/tvm/utils/roofline/x86.py +++ /dev/null @@ -1,331 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Estimate peak flops and bandwidth for x86 devices""" -import functools -import re -from typing import Dict, Optional, Tuple - -import numpy as np - -from ... import build, get_global_func, nd, transform -from ...contrib import utils -from ...rpc.base import RPC_SESS_MASK -from ...rpc.client import RPCSession -from ...runtime import DataType, Device, num_threads -from ...script import tir as T -from ...target import Target, x86 -from ...tir import PrimFunc -from . import registry - - -def _detect_vec_width_registers( - target: Target, vec_width: Optional[int], num_vector_registers: Optional[int] -): - """Get the vector width and number of vector registers for a target. - - Parameters - ---------- - target : Target - Target to detect vector width and registers for. - vec_width : Optional[int] - If None, try and detect vector width from target. Otherwise provided input is used. - num_vector_registers : Optional[int] - If None, try and number of vector registers from target. Otherwise provided input is used. - - Returns - ------- - vec_width: int - Width of a vector register on `target` in bytes. - num_vector_registers: int - Number of vector registers on `target`. - """ - if vec_width is None: - # Only implemented for x86 so far... 
- if ( - str(target.kind) == "llvm" - and target.device_name == "" - and len(target.keys) == 1 - and target.keys[0] == "cpu" - ): - with target: - vec_width = x86.get_simd_32bit_lanes() * 4 # in number of bytes - else: - raise RuntimeError(f"Cannot determine vector width for target {target}") - if num_vector_registers is None: - if target.device_name == "": # indicates x86 - num_vector_registers = 16 # Assuming for all platforms, probably wrong on older ones - else: - raise RuntimeError(f"Cannot determine number of vector registers for target {target}") - return vec_width, num_vector_registers - - -@functools.lru_cache(maxsize=None) -def estimate_peak_fma_vector_flops( - target: Target, - dev: Device, - remote: Optional[RPCSession], - dtype: DataType, - vec_width: Optional[int] = None, - num_vector_registers: Optional[int] = None, -): - """Estimate peak flops assuming vector fma instructions and no explicit - intrinsics. See estimate_peak_fma_flops. - """ - - @T.prim_func - def peakflops_fma_tir( - a: T.handle, - vec_width: T.int32, - iters: T.int32, - num_vector_registers: T.int32, - threads: T.int32, - ) -> None: - # pylint: disable=invalid-name, missing-function-docstring - A = T.match_buffer(a, [threads, num_vector_registers, vec_width], dtype) - for t in T.parallel(threads): - for _j in range(iters): - for l in T.unroll(num_vector_registers): - # We want to use as few registers as possible, so we perform - # all operations on the same element - for k in T.vectorized(vec_width): - A[t, l, k] = A[t, l, k] * A[t, l, k] + A[t, l, k] - - vec_width, num_vector_registers = _detect_vec_width_registers( - target, vec_width, num_vector_registers - ) - vec_width //= DataType(dtype).bits // 8 - iters = 1000000 - nthreads = num_threads() - specialized = peakflops_fma_tir.specialize( - { - peakflops_fma_tir.params[1]: vec_width, - peakflops_fma_tir.params[2]: iters, - peakflops_fma_tir.params[3]: num_vector_registers, - peakflops_fma_tir.params[4]: nthreads, - } - ) - with 
transform.PassContext(opt_level=3): - f = build(specialized, target=target) - - # upload to remote if running over rpc - if dev.device_type >= RPC_SESS_MASK: - if remote is None: - raise RuntimeError("A RPCSession must be provided when using a remote device.") - temp = utils.tempdir() - path = temp.relpath("peak_fma_flops.tar") - f.export_library(path) - remote.upload(path) - f = remote.load_module("peak_fma_flops.tar") - random_fill = remote.get_function("tvm.contrib.random.random_fill") - else: - random_fill = get_global_func("tvm.contrib.random.random_fill") - assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake" - - a = nd.empty((nthreads, num_vector_registers, vec_width), dtype=dtype, device=dev) - random_fill(a) - times = f.time_evaluator(f.entry_name, dev, repeat=100, number=1)(a) - flops = 2 * vec_width * num_vector_registers * nthreads * iters # fma is two flops - return flops / times.min - - -@registry.estimate_peak_flops.register("cpu") -def estimate_peak_fma_flops( - func: PrimFunc, - features: Dict[str, np.ndarray], - target: Target, - dev: Device, - remote: Optional[RPCSession], - vec_width: Optional[int] = None, - num_vector_registers: Optional[int] = None, -) -> Tuple[float, float, str]: - """ - Estimate the maximum number of FLOP/s this target/device combo is capable - of reaching by running a test program. This assumes vectorized FMA - (fused-multiply-add) instructions. - - - Parameters - ---------- - func : PrimFunc - Function to estimate peak flops for. Used to check if a specific kind - intrinsic or dtype could be used with this function. - features : Dict[str, np.ndarry] - Features extracted from `func`. Used to check if a specific kind - intrinsic or dtype could be used with this function. - target : Target - Target to run on. This should be as specific to the actual hardware as - possible to make sure that LLVM generates the best vector code. - dev : Device - Device to run on. 
- remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. - vec_width : Optional[int] - Vector width of SIMD units on the underlying hardware. Will try to - infer if no value is provided. - num_vector_registers : Optional[int] - Number of vector registers on the underlying hardware. Will try to - infer if no value is provided. - - Returns - ------- - flops : float - Estimated number of flops used by `func`. - peak_flops : float - Approximate sustained FLOP/s of this target/device combo assuming - vectorized FMA instructions. Each FMA operation counts as two FLOPs. - name : str - Dtype/intrinsic used by `func` to achieve peak flops. - """ - # assume that the first argument's dtype is the one we want - dtype = list(func.buffer_map.values())[0].dtype - if "int" in dtype: - flops = np.sum( - features["int_addsub"] - + features["int_mul"] - + features["int_mad"] * 2 - + features["int_divmod"] - ) - else: - flops = np.sum( - features["float_addsub"] - + features["float_mul"] - + features["float_mad"] * 2 - + features["float_divmod"] - ) - peak_flops = estimate_peak_fma_vector_flops( - target, dev, remote, dtype, vec_width, num_vector_registers - ) - return flops, peak_flops, f"{dtype} FMA" - - -@T.prim_func -def peak_bandwidth_tir(a: T.handle, b: T.handle, threads: T.int32, vec_width: T.int32) -> None: - # pylint: disable=invalid-name, missing-function-docstring - N = T.int32() - A = T.match_buffer(a, [threads, N, 4, vec_width], "float32") - B = T.match_buffer(b, [threads, 4, vec_width], "float32") - # Parallelism is necessary to hit all cores/nodes - for i in T.parallel(threads): - for k in T.serial(N): - for l in T.unroll(4): - # vectorized load is necessary to hit peak bandwidth - for j in T.vectorized(vec_width): - # += is necessary to introduce a data dependency for all - # elements of A, preventing the backend from removing the - # `k` loop and setting `k` to the loop extent. 
- B[i, l, j] += A[i, k, l, j] - - -@functools.lru_cache(maxsize=None) -def estimate_peak_bandwidth_dram( - target: Target, - dev: Device, - remote: Optional[RPCSession], - vec_width: Optional[int] = None, -) -> float: - """Estimate peak bandwidth for DRAM. See estimate_peak_bandwidth.""" - vec_width, _ = _detect_vec_width_registers(target, vec_width, 1) - specialized = peak_bandwidth_tir.specialize( - { - peak_bandwidth_tir.params[3]: vec_width, - } - ) - with transform.PassContext(opt_level=3): - f = build(specialized, target=target) - - # upload to remote if running over rpc - if dev.device_type >= RPC_SESS_MASK: - if remote is None: - raise RuntimeError("A RPCSession must be provided when using a remote device.") - temp = utils.tempdir() - path = temp.relpath("peak_bandwidth.tar") - f.export_library(path) - remote.upload(path) - f = remote.load_module("peak_bandwidth.tar") - random_fill = remote.get_function("tvm.contrib.random.random_fill") - else: - random_fill = get_global_func("tvm.contrib.random.random_fill") - assert random_fill, "Please make sure USE_RANDOM is ON in config.cmake" - - threads = num_threads() - # Data size needs to be larger than last level of cache. We don't have a - # way of getting cache sizes, so this number should give us a large enough - # size. 
- size = 10**8 // (4 * threads * vec_width) - a = nd.empty((threads, size, 4, vec_width), dtype="float32", device=dev) - random_fill(a) - b = nd.empty((threads, 4, vec_width), dtype="float32", device=dev) - random_fill(b) - times = f.time_evaluator(f.entry_name, dev, repeat=10, number=1)(a, b, threads) - return a.numpy().size * 4 / times.min # 4 bytes per float32 - - -@registry.estimate_peak_bandwidth.register("cpu") -def estimate_peak_bandwidth( - func: PrimFunc, # pylint: disable=unused-argument - features: Dict[str, np.ndarray], - target: Target, - dev: Device, - remote: Optional[RPCSession], - vec_width: Optional[int] = None, -) -> Tuple[float, float, str]: - """Estimate peak memory bandwidth of a target/device combo. - - Peak bandwidth is estimated by running a small experiment on the underlying - hardware. The peak bandwidth measurement assumes that vector instructions - are being used to load the data. - - Parameters - ---------- - func : PrimFunc - Function to estimate peak bandwidth for. Used to check if a specific - kind of memory could be used with this function. - features : Dict[str, np.ndarry] - Features extracted from `func`. Used to check if a specific kind of - memory could be used with this function. - target : Target - Target to use for measurement. This target should be as specific to the - underlying hardware as possible. - dev : Device - Device to measure peak bandwidth on. - remote : Optional[RPCSession] - Remote session used to upload artifacts for runtime evaluation. Must be - the same session used to create `dev`. - vec_width : Optional[int] - Vector unit width, determined from target if not supplied. - - Returns - ------- - loaded_bytes : float - Estimated bytes loaded by `func`. - peak_bandwidth : float - Peak memory bandwidth in bytes/seconds. - name : str - Name of the memory being used. - """ - # Ideally we'd be able to use this code to measure peak bandwidth of the - # different cache levels. 
If we could just generate load commands, then we - # could use those in a tight loop. Instead we need some code that is - # limited on the cache bandwidth. With the L1 cache we need an operation - # that has a very low arithmetic intensity and we haven't come up with one - # yet. - peak_bandwidth = estimate_peak_bandwidth_dram(target, dev, remote, vec_width) - loaded_bytes = sum( - [np.sum(x) for (k, x) in features.items() if re.match(r"^B[0-9]+\.bytes$", k) is not None] - ) - return loaded_bytes, peak_bandwidth, "DRAM" diff --git a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py index bc3ae64b46c1..67d5d84a0c1d 100644 --- a/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py +++ b/tests/python/all-platform-minimal-test/test_minimal_target_codegen_llvm.py @@ -30,7 +30,7 @@ @tvm.testing.requires_llvm def test_llvm_add_pipeline(): """all-platform-minimal-test: Check LLVM enablement.""" - nn = 1024 + nn = 128 n = tvm.runtime.convert(nn) A = te.placeholder((n,), name="A") B = te.placeholder((n,), name="B") @@ -38,23 +38,15 @@ def test_llvm_add_pipeline(): BB = te.compute((n,), lambda *i: B(*i), name="B") T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T") C = te.compute(A.shape, lambda *i: T(*i), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - xo1, xo2 = s[C].split(xo, factor=13) - s[C].parallel(xo2) - s[C].pragma(xo1, "parallel_launch_point") - s[C].pragma(xo2, "parallel_stride_pattern") - s[C].pragma(xo2, "parallel_barrier_when_finish") - s[C].vectorize(xi) + + sch = tvm.tir.Schedule(te.create_prim_func([A, B, C])) + xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 4]) + sch.parallel(xo) + sch.vectorize(xi) def check_llvm(): - # Specifically allow offset to test codepath when offset is available - Ab = tvm.tir.decl_buffer( - A.shape, A.dtype, elem_offset=te.size_var("Aoffset"), 
offset_factor=8, name="A" - ) - binds = {A: Ab} # BUILD and invoke the kernel. - f = tvm.build(s, [A, B, C], "llvm", binds=binds) + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. n = nn diff --git a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py index 8f929b1c1a76..d01f9599ffe0 100644 --- a/tests/python/all-platform-minimal-test/test_runtime_ndarray.py +++ b/tests/python/all-platform-minimal-test/test_runtime_ndarray.py @@ -69,28 +69,6 @@ def test_memory_usage(target, dev, dtype): assert dev.available_global_memory == available_memory_before -@pytest.mark.skip(reason="Skip for passing windows test on CI") -def test_fp16_conversion(): - n = 100 - - for src, dst in [("float32", "float16"), ("float16", "float32")]: - A = te.placeholder((n,), dtype=src) - B = te.compute((n,), lambda i: A[i].astype(dst)) - - s = te.create_schedule([B.op]) - func = tvm.build(s, [A, B], "llvm") - - x_tvm = tvm.nd.array(100 * np.random.randn(n).astype(src) - 50) - y_tvm = tvm.nd.array(100 * np.random.randn(n).astype(dst) - 50) - - func(x_tvm, y_tvm) - - expected = x_tvm.numpy().astype(dst) - real = y_tvm.numpy() - - tvm.testing.assert_allclose(expected, real) - - def test_dtype(): dtype = tvm.DataType("handle") assert dtype.type_code == tvm.DataTypeCode.HANDLE diff --git a/tests/python/codegen/test_target_codegen_aarch64.py b/tests/python/codegen/test_target_codegen_aarch64.py index 366198c7de6a..8bd0cb17267d 100644 --- a/tests/python/codegen/test_target_codegen_aarch64.py +++ b/tests/python/codegen/test_target_codegen_aarch64.py @@ -43,9 +43,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: A[i] * B[i], name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load 
instructions and mul instructions using z registers assembly = f.get_source("asm") @@ -75,9 +73,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: A[i] + B[i], name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and add instructions using z registers assembly = f.get_source("asm") @@ -107,9 +103,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: A[i] - B[i], name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and sub instructions using z registers assembly = f.get_source("asm") @@ -140,9 +134,7 @@ def check_correct_assembly(type): B = te.placeholder(m, dtype=type, name="B") C = te.placeholder(m, dtype=type, name="C") D = te.compute((m), lambda i: A[i] * B[i] + C[i], name="D") - s = te.create_schedule([D.op]) - - f = tvm.build(s, [A, B, C, D], target) + f = tvm.build(te.create_prim_func([A, B, C, D]), target=target) # Verify we see SVE load instructions and either mad or mla instructions using z registers assembly = f.get_source("asm") @@ -172,9 +164,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: tvm.te.max(A[i], B[i])) - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and cmgt + sel instructions or a max instruction, all using z registers assembly = f.get_source("asm") @@ -208,9 +198,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = 
te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: tvm.te.min(A[i], B[i])) - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and cmgt + sel instructions or a min instruction, all using z registers assembly = f.get_source("asm") @@ -244,9 +232,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: tvm.te.div(A[i], B[i])) - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and div instructions using z registers assembly = f.get_source("asm") @@ -256,7 +242,7 @@ def check_correct_assembly(type): ) assert len(loads) > 1 - assert len(matches) > 1 + assert len(matches) >= 1 check_correct_assembly(type=dtype) @@ -275,9 +261,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: tvm.te.floormod(A[i], B[i]), name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and mls instructions using z registers assembly = f.get_source("asm") @@ -307,9 +291,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: A[i] == B[i], name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and cmpeq or cmeq instructions using z registers assembly = f.get_source("asm") @@ -339,9 +321,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = 
te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: A[i] != B[i], name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and cmpgt, cmgt, cmpne or cmne instructions, all using z registers assembly = f.get_source("asm") @@ -370,9 +350,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: A[i] | B[i], name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and orr instructions using z registers assembly = f.get_source("asm") @@ -401,9 +379,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype=type, name="B") C = te.compute((m), lambda i: A[i] & B[i], name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see SVE load instructions and and instructions using z registers assembly = f.get_source("asm") @@ -431,9 +407,7 @@ def check_correct_assembly(type): m = te.var("m") A = te.placeholder(m, dtype=type, name="A") C = te.compute((m), lambda i: ~A[i], name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, C], target) + f = tvm.build(te.create_prim_func([A, C]), target=target) # Verify we see SVE load instructions and eor instructions using z registers assembly = f.get_source("asm") @@ -466,9 +440,7 @@ def check_correct_assembly(type): A = te.placeholder(m, dtype=type, name="A") B = te.placeholder(m, dtype="int32", name="B") C = te.compute((m), lambda i: A[B[i]], name="C") - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) # Verify we see gather 
instructions in the assembly assembly = f.get_source("asm") @@ -557,10 +529,7 @@ def test_vscale_range_function_attribute(mattr, expect_attr): m = te.var("m") A = te.placeholder(m, dtype="float32", name="A") C = te.compute((m), lambda i: A[i] + 1, name="C") - s = te.create_schedule([C.op]) - - with tvm.target.Target(target) as target: - f = tvm.build(s, [A, C], target) + f = tvm.build(te.create_prim_func([A, C]), target=target) # Check if the vscale_range() attribute exists ll = f.get_source("ll") diff --git a/tests/python/codegen/test_target_codegen_arm.py b/tests/python/codegen/test_target_codegen_arm.py index b5c69d6df1a6..9357d38e667b 100644 --- a/tests/python/codegen/test_target_codegen_arm.py +++ b/tests/python/codegen/test_target_codegen_arm.py @@ -28,10 +28,9 @@ def check_correct_assembly(type, elements, counts): n = tvm.runtime.convert(elements) A = te.placeholder(n, dtype=type, name="A") B = te.compute(A.shape, lambda i: tvm.tir.popcount(A[i]), name="B") - s = te.create_schedule(B.op) - s[B].vectorize(s[B].op.axis[0]) - f = tvm.build(s, [A, B], target) - + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + sch.vectorize(sch.get_loops("B")[0]) + f = tvm.build(sch.mod, target=target) # Verify we see the correct number of vpaddl and vcnt instructions in the assembly assembly = f.get_source("asm") matches = re.findall("vpaddl", assembly) @@ -59,9 +58,9 @@ def check_correct_assembly(N): lambda n: te.sum(A[k, n].astype("int32") * B[k, n].astype("int32"), axis=[k]), name="C", ) - s = te.create_schedule(C.op) - s[C].vectorize(s[C].op.axis[0]) - f = tvm.build(s, [A, B, C], target) + sch = tvm.tir.Schedule(te.create_prim_func([A, B, C])) + sch.vectorize(sch.get_loops("C")[0]) + f = tvm.build(sch.mod, target=target) # Verify we see the correct number of vmlal.s16 instructions assembly = f.get_source("asm") @@ -83,9 +82,9 @@ def check_broadcast_correct_assembly(N): lambda n: te.sum(A[k, n].astype("int32") * B[k].astype("int32"), axis=[k]), name="C", ) - s = 
te.create_schedule(C.op) - s[C].vectorize(s[C].op.axis[0]) - f = tvm.build(s, [A, B, C], target) + sch = tvm.tir.Schedule(te.create_prim_func([A, B, C])) + sch.vectorize(sch.get_loops("C")[0]) + f = tvm.build(sch.mod, target=target) # Verify we see the correct number of vmlal.s16 instructions assembly = f.get_source("asm") diff --git a/tests/python/codegen/test_target_codegen_bool.py b/tests/python/codegen/test_target_codegen_bool.py index b9f4437110c8..a575c0cec9c9 100644 --- a/tests/python/codegen/test_target_codegen_bool.py +++ b/tests/python/codegen/test_target_codegen_bool.py @@ -35,29 +35,24 @@ def compute(arr_size): @tvm.testing.fixture -def schedule(target, compute): +def get_module(target, compute): target = tvm.target.Target(target) A, B, C, D = compute if target.kind.name == "llvm": - s = te.create_schedule(D.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - xo1, xo2 = s[C].split(xo, factor=13) - s[C].parallel(xo2) + return tvm.IRModule.from_expr(te.create_prim_func([A, B, D])) - else: - s = te.create_schedule(D.op) - for stage in [C, D]: - xo, xi = s[stage].split(stage.op.axis[0], factor=4) - s[stage].bind(xo, te.thread_axis("blockIdx.x")) - s[stage].bind(xi, te.thread_axis("threadIdx.x")) - - return s + sch = tvm.tir.Schedule(te.create_prim_func([A, B, D])) + for stage in ["C", "D"]: + xo, xi = sch.split(sch.get_loops(stage)[0], factors=[None, 4]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "blockIdx.x") + return sch.mod @tvm.testing.uses_gpu -def test_cmp_load_store(target, dev, arr_size, compute, schedule): +def test_cmp_load_store(target, dev, arr_size, compute, get_module): A, B, _, D = compute - f = tvm.build(schedule, [A, B, D], target) + f = tvm.build(get_module, target=target) a_np = np.random.uniform(size=arr_size).astype(A.dtype) b_np = np.random.uniform(size=arr_size).astype(B.dtype) diff --git a/tests/python/codegen/test_target_codegen_c_host.py b/tests/python/codegen/test_target_codegen_c_host.py index 3aca0fc8c77e..d7a7cbc8a44b 100644 
--- a/tests/python/codegen/test_target_codegen_c_host.py +++ b/tests/python/codegen/test_target_codegen_c_host.py @@ -31,61 +31,19 @@ def test_add(): A = te.placeholder((n,), name="A") B = te.placeholder((n,), name="B") C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = te.create_schedule(C.op) def check_c(): - mhost = tvm.build(s, [A, B, C], "c", name="test_fadd") - temp = utils.tempdir() - path_dso = temp.relpath("temp.so") - mhost.export_library(path_dso) - m = tvm.runtime.load_module(path_dso) - fadd = m["test_fadd"] - dev = tvm.cpu(0) - # launch the kernel. - n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - check_c() - - -def test_add_pipeline(): - nn = 1024 - n = tvm.runtime.convert(nn) - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - AA = te.compute((n,), lambda *i: A(*i), name="A") - BB = te.compute((n,), lambda *i: B(*i), name="B") - T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T") - C = te.compute(A.shape, lambda *i: T(*i), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - xo1, xo2 = s[C].split(xo, factor=13) - s[C].parallel(xo2) - s[C].pragma(xo1, "parallel_launch_point") - s[C].pragma(xo2, "parallel_stride_pattern") - s[C].pragma(xo2, "parallel_barrier_when_finish") - # FIXME(tvm-team): vector operators are not supported for codegen to C yet - # s[C].vectorize(xi) - - def check_c(): - # Specifically allow offset to test codepath when offset is available - Ab = tvm.tir.decl_buffer( - A.shape, A.dtype, elem_offset=te.size_var("Aoffset"), offset_factor=8, name="A" + mhost = tvm.build( + tvm.IRModule.from_expr( + te.create_prim_func([A, B, C]).with_attr("global_symbol", "test_fadd") + ), + target="c", ) - binds = {A: Ab} - # BUILD and invoke 
the kernel. - f1 = tvm.lower(s, [A, B, C], name="test_fadd_pipeline") - mhost = tvm.build(f1, target="c") - temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) - fadd = m["test_fadd_pipeline"] + fadd = m["test_fadd"] dev = tvm.cpu(0) # launch the kernel. n = nn @@ -105,10 +63,14 @@ def test_reinterpret(): B = te.compute( A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.reinterpret", 2 + A(*i)), name="B" ) - s = te.create_schedule(B.op) def check_c(): - mhost = tvm.build(s, [A, B], "c", name="test_reinterpret") + mhost = tvm.build( + tvm.IRModule.from_expr( + te.create_prim_func([A, B]).with_attr("global_symbol", "test_reinterpret") + ), + target="c", + ) temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) @@ -129,10 +91,14 @@ def test_ceil(): n = tvm.runtime.convert(nn) A = te.placeholder((n,), name="A", dtype="float32") B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.ceil", A(*i)), name="B") - s = te.create_schedule(B.op) def check_c(): - mhost = tvm.build(s, [A, B], "c", name="test_ceil") + mhost = tvm.build( + tvm.IRModule.from_expr( + te.create_prim_func([A, B]).with_attr("global_symbol", "test_ceil") + ), + target="c", + ) temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) @@ -153,10 +119,14 @@ def test_floor(): n = tvm.runtime.convert(nn) A = te.placeholder((n,), name="A", dtype="float32") B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.floor", A(*i)), name="B") - s = te.create_schedule(B.op) def check_c(): - mhost = tvm.build(s, [A, B], "c", name="test_floor") + mhost = tvm.build( + tvm.IRModule.from_expr( + te.create_prim_func([A, B]).with_attr("global_symbol", "test_floor") + ), + target="c", + ) temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) @@ -177,10 +147,14 @@ def test_round(): n = tvm.runtime.convert(nn) A = 
te.placeholder((n,), name="A", dtype="float32") B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.round", A(*i)), name="B") - s = te.create_schedule(B.op) def check_c(): - mhost = tvm.build(s, [A, B], "c", name="test_round") + mhost = tvm.build( + tvm.IRModule.from_expr( + te.create_prim_func([A, B]).with_attr("global_symbol", "test_round") + ), + target="c", + ) temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) @@ -196,42 +170,6 @@ def check_c(): check_c() -def test_call_packed(): - def fake_func(fname="fake.func"): - ib = tvm.tir.ir_builder.create() - A = ib.pointer("float32", name="A") - fake_func1 = tvm.tir.call_packed(fname, A[0]) - - ib.emit(fake_func1) - body = ib.get() - return A, body - - def check_global_packed_func(): - fname = "fake.func" - A, body = fake_func(fname) - func1 = tvm.tir.PrimFunc([A], body).with_attr("global_symbol", "func1") - B, body = fake_func() - func2 = tvm.tir.PrimFunc([B], body).with_attr("global_symbol", "func2") - mod = tvm.IRModule({"fake_func1": func1, "fake_func2": func2}) - fcode = tvm.build(mod, None, "c") - src = fcode.get_source() - - # there are two locations calling the packed func - assert src.count(fname) == 2 - - suffix = "_packed" - packed_func_name = fname + suffix - # func name will be standardized by GetUniqueName and not exists anymore - assert src.find(packed_func_name) == -1 - - packed_func_real_name = "_".join(fname.split(".")) + suffix - func_declaration = "static void* %s = NULL;" % packed_func_real_name - # src only has 1 valid declaration - assert src.count(func_declaration) == 1 - - check_global_packed_func() - - def test_subroutine_call(): @I.ir_module class mod: diff --git a/tests/python/codegen/test_target_codegen_cross_llvm.py b/tests/python/codegen/test_target_codegen_cross_llvm.py index 8758ae2a04e8..9dc001e1949a 100644 --- a/tests/python/codegen/test_target_codegen_cross_llvm.py +++ b/tests/python/codegen/test_target_codegen_cross_llvm.py 
@@ -32,10 +32,11 @@ def test_llvm_add_pipeline(): A = te.placeholder((n,), name="A") B = te.placeholder((n,), name="B") C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - s[C].parallel(xo) - s[C].vectorize(xi) + + sch = tvm.tir.Schedule(te.create_prim_func([A, B, C])) + xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 4]) + sch.parallel(xo) + sch.vectorize(xi) def verify_elf(path, e_machine): with open(path, "rb") as fi: @@ -48,7 +49,7 @@ def verify_elf(path, e_machine): def build_i386(): temp = utils.tempdir() target = "llvm -mtriple=i386-pc-linux-gnu" - f = tvm.build(s, [A, B, C], target) + f = tvm.build(sch.mod, target=target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x03) @@ -59,7 +60,7 @@ def build_arm(): print("Skip because %s is not enabled.." % target) return temp = utils.tempdir() - f = tvm.build(s, [A, B, C], target) + f = tvm.build(sch.mod, target=target) path = temp.relpath("myadd.o") f.save(path) verify_elf(path, 0x28) diff --git a/tests/python/codegen/test_target_codegen_cuda.py b/tests/python/codegen/test_target_codegen_cuda.py index 7b370f3e3211..ae3173a14dee 100644 --- a/tests/python/codegen/test_target_codegen_cuda.py +++ b/tests/python/codegen/test_target_codegen_cuda.py @@ -28,9 +28,6 @@ import tvm.testing import pytest -tx = te.thread_axis("threadIdx.x") -bx = te.thread_axis("blockIdx.x") - @tvm.testing.requires_gpu @tvm.testing.requires_cuda @@ -46,11 +43,13 @@ def check_cuda(dtype, n, lanes): return A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes)) B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(xo, bx) - s[B].bind(xi, tx) - fun = tvm.build(s, [A, B], "cuda") + + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, num_thread]) + 
sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + fun = tvm.build(sch.mod, target="cuda") + dev = tvm.cuda(0) a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) c = tvm.nd.empty((n,), B.dtype, dev) @@ -96,14 +95,15 @@ def np_bf162np_float(arr): def check_cuda(n, lanes): A = te.placeholder((n,), name="A", dtype="bfloat16x%d" % lanes) B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(xo, bx) - s[B].bind(xi, tx) + + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, num_thread]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") with tvm.transform.PassContext( disabled_pass=["tir.BF16Promote", "tir.BF16CastElimination", "tir.BF16TypeLowering"] ): - fun = tvm.build(s, [A, B], "cuda") + fun = tvm.build(sch.mod, target="cuda") dev = tvm.cuda(0) np_a = np.random.uniform(size=(n, lanes)).astype("float32") np_a = np_bf162np_float(np_float2np_bf16(np_a)) @@ -134,11 +134,12 @@ def check_cuda(dtype, n, lanes): D = te.compute( (n,), lambda i: tvm.tir.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name="D" ) - s = te.create_schedule(D.op) - xo, xi = s[D].split(D.op.axis[0], factor=num_thread) - s[D].bind(xo, bx) - s[D].bind(xi, tx) - fun = tvm.build(s, [A, B, C, D], "cuda") + sch = tvm.tir.Schedule(te.create_prim_func([A, B, C, D])) + xo, xi = sch.split(sch.get_loops("D")[0], factors=[None, num_thread]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + fun = tvm.build(sch.mod, target="cuda") + np_a = np.random.randint(low=-128, high=127, size=(n, lanes)) np_b = np.random.randint(low=-128, high=127, size=(n, lanes)) np_c = np.random.randint(low=0, high=127, size=(n,)) @@ -163,11 +164,13 @@ def check_cuda(dtype, n, lanes): dev = tvm.cuda(0) A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes)) B = te.compute((n,), lambda 
i: A[i], name="B") - s = te.create_schedule(B.op) - block, thread = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(block, bx) - s[B].bind(thread, tx) - fun = tvm.build(s, [A, B], "cuda", name="vector_load") + + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, num_thread]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + fun = tvm.build(sch.mod, target="cuda") + np_a = np.random.randint(low=-128, high=127, size=(n, lanes)) a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np_a) b = tvm.nd.empty((n,), B.dtype, dev) @@ -187,12 +190,14 @@ def test_cuda_make_int8(): def check_cuda(n, value, lanes): dtype = "int8" dev = tvm.cuda(0) - A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype)) - s = te.create_schedule(A.op) - y, x = s[A].op.axis - s[A].vectorize(x) - s[A].bind(y, bx) - fun = tvm.build(s, [A], "cuda", name="make_int8x4") + A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype), name="A") + + sch = tvm.tir.Schedule(te.create_prim_func([A])) + y, x = sch.get_loops("A") + sch.vectorize(x) + sch.bind(y, "blockIdx.x") + fun = tvm.build(sch.mod, target="cuda") + np_a = np.full((n, lanes), value, dtype=dtype) a = tvm.nd.empty(np_a.shape, dtype, dev) fun(a) @@ -215,13 +220,13 @@ def test_cuda_make_int4(): def check_cuda(n, value, lanes): dtype = "int4" dev = tvm.cuda(0) - A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype)) - s = te.create_schedule(A.op) - y, x = s[A].op.axis - s[A].vectorize(x) - s[A].bind(y, bx) - kernel_name = "make_int4x" + str(lanes) - fun = tvm.build(s, [A], "cuda", name=kernel_name) + A = te.compute((n, lanes), lambda i, j: tvm.tir.const(value, dtype=dtype), name="A") + sch = tvm.tir.Schedule(te.create_prim_func([A])) + y, x = sch.get_loops("A") + sch.vectorize(x) + sch.bind(y, "blockIdx.x") + fun = tvm.build(sch.mod, target="cuda") + np_a = np.full((n, lanes), value, dtype="int8") a = tvm.nd.empty((n, 
lanes), dtype, dev) fun(a) @@ -246,9 +251,13 @@ def check_inf_nan(dev, n, value, dtype): A = te.placeholder((n,), name="A", dtype=dtype) inf_value = tvm.tir.const(value, dtype=dtype) C = te.compute((n,), lambda i: inf_value, name="C") - s = te.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], tx) - fun = tvm.build(s, [A, C], target) + + sch = tvm.tir.Schedule(te.create_prim_func([A, C])) + xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 8]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + fun = tvm.build(sch.mod, target="cuda") + a = tvm.nd.empty((n,), A.dtype, dev) c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here @@ -264,53 +273,6 @@ def check_inf_nan(dev, n, value, dtype): check_inf_nan(dev, 1, float("nan"), "float64") -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_cuda_shuffle(): - idxm = tvm.tir.indexmod - a = te.placeholder((64,), "int32") - b = te.placeholder((64,), "int32") - c = te.compute((64,), lambda x: a[x] + b[x - idxm(x, 4) + (3 - idxm(x, 4))]) - sch = te.create_schedule(c.op) - x = c.op.axis[0] - xo, xi = sch[c].split(x, 4) - thrx = te.thread_axis("threadIdx.x") - sch[c].bind(xo, thrx) - sch[c].vectorize(xi) - - def MyVectorize(): - def vectorizer(op): - if op.kind == tvm.tir.ForKind.VECTORIZED: - idx = tvm.tir.Ramp(4 * thrx.var, 1, 4) - store = op.body - value = store.value - new_a = tvm.tir.BufferLoad(value.a.buffer, [idx]) - bs, ids = [], [] - for i in range(4): - bs.append(tvm.tir.BufferLoad(value.b.buffer, [4 * thrx.var + i])) - ids.append(3 - i) - new_b = tvm.tir.Shuffle(bs, ids) - return tvm.tir.BufferStore(store.buffer, new_a + new_b, [idx]) - return None - - def _transform(f, *_): - return f.with_body( - tvm.tir.stmt_functor.ir_transform(f.body, None, vectorizer, ["tir.For"]) - ) - - return tvm.tir.transform.prim_func_pass(_transform, opt_level=0, name="MyVectorize") - - with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, MyVectorize())]}): - module = tvm.build(sch, 
[a, b, c], target="cuda") - a_ = np.array(list(range(64)), dtype="int32") - b_ = np.array((list(range(4))[::-1]) * 16, dtype="int32") - c_ = np.zeros((64,), dtype="int32") - ref = a_ + np.array((list(range(4))) * 16, dtype="int32") - nda, ndb, ndc = [tvm.nd.array(i, tvm.cuda(0)) for i in [a_, b_, c_]] - module(nda, ndb, ndc) - tvm.testing.assert_allclose(ndc.numpy(), ref) - - @tvm.testing.parametrize_targets("cuda", "rocm") def test_crossthread_reduction1(target, dev): n = te.var("n") @@ -320,12 +282,13 @@ def test_crossthread_reduction1(target, dev): B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B") def sched(nthd): - s = te.create_schedule(B.op) - ko, _ = s[B].split(B.op.reduce_axis[0], nparts=nthd) - s[B].bind(ko, te.thread_axis("threadIdx.x")) - s[B].bind(B.op.axis[0], te.thread_axis("blockIdx.x")) - func = tvm.build(s, [A, B], target) - return func + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + x, k = sch.get_loops("B") + ko, _ = sch.split(k, factors=[nthd, None]) + sch.bind(ko, "threadIdx.x") + sch.bind(x, "blockIdx.x") + fun = tvm.build(sch.mod, target="cuda") + return fun def verify(nthd): func = sched(nthd) @@ -355,13 +318,14 @@ def test_crossthread_reduction2(target, dev): B = te.compute((n,), lambda i: te.sum(A[i, k0, k1], axis=(k0, k1)), name="B") def sched(nthdx, nthdy): - s = te.create_schedule(B.op) - k0o, _ = s[B].split(B.op.reduce_axis[0], nparts=nthdx) - k1o, _ = s[B].split(B.op.reduce_axis[1], nparts=nthdy) - s[B].bind(k0o, te.thread_axis("threadIdx.x")) - s[B].bind(k1o, te.thread_axis("threadIdx.y")) - s[B].bind(B.op.axis[0], te.thread_axis("blockIdx.x")) - func = tvm.build(s, [A, B], target) + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + x, k0, k1 = sch.get_loops("B") + k0o, _ = sch.split(k0, factors=[nthdx, None]) + k1o, _ = sch.split(k1, factors=[nthdy, None]) + sch.bind(k0o, "threadIdx.x") + sch.bind(k1o, "threadIdx.y") + sch.bind(x, "blockIdx.x") + func = tvm.build(sch.mod, target="cuda") return func def 
verify(nthdx, nthdy): @@ -389,42 +353,13 @@ def test_cuda_reduction_binding(): k = te.reduce_axis((0, 32), "k") A = te.placeholder((96, 32), name="A") B = te.compute((96,), lambda m: te.sum(A[m, k], axis=k), name="B") - s = te.create_schedule(B.op) - - s[B].reorder(B.op.reduce_axis[0], B.op.axis[0]) - - mo, _ = s[B].split(B.op.axis[0], 32) - s[B].bind(mo, te.thread_axis("blockIdx.x")) - fcuda = tvm.build(s, [A, B], "cuda") - - -@tvm.testing.parametrize_targets("cuda", "rocm") -def test_rfactor_predicates(target, dev): - n = te.reduce_axis((0, 129), "n") - A = te.placeholder((129,), name="A") - B = te.compute((1,), lambda b: te.sum(A[n], axis=n), name="B") - - s = te.create_schedule(B.op) - - _, ni = s[B].split(s[B].op.reduce_axis[0], factor=8) - - BF = s.rfactor(B, ni, 0) - s[B].set_store_predicate(tx.var.equal(0)) - - s[B].bind(s[B].op.reduce_axis[0], tx) - s[B].bind(s[B].op.axis[0], bx) - - s[BF].compute_at(s[B], s[B].op.axis[0]) - - _, noi = s[BF].split(s[BF].op.reduce_axis[0], factor=2) - - BF2 = s.rfactor(BF, noi, 0) - - s[BF].bind(s[BF].op.axis[0], tx) - s[BF2].compute_at(s[BF], s[BF].op.axis[1]) - - fcuda = tvm.build(s, [A, B], target) + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + x, k = sch.get_loops("B") + sch.reorder(k, x) + mo, _ = sch.split(x, factors=[None, 32]) + sch.bind(mo, "blockIdx.x") + func = tvm.build(sch.mod, target="cuda") @tvm.testing.requires_gpu @@ -436,15 +371,14 @@ def test_cuda_const_float_to_half(): shape = (2, 3, 4) a = te.placeholder(shape, dtype="float16", name="a") b = tvm.tir.const(0.5, dtype="float16") - c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name="c") - s = te.create_schedule(c.op) - axes = [axis for axis in c.op.axis] - fused = s[c].fuse(*axes) - bx, tx = s[c].split(fused, factor=64) - s[c].bind(bx, te.thread_axis("blockIdx.x")) - s[c].bind(tx, te.thread_axis("threadIdx.x")) - - func = tvm.build(s, [a, c], "cuda") + c = te.compute(shape, lambda i, j, k: a[i, j, k] > b, name="C") + + sch = 
tvm.tir.Schedule(te.create_prim_func([a, c])) + xo, xi = sch.split(sch.fuse(*sch.get_loops("C")), factors=[None, 64]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + func = tvm.build(sch.mod, target="cuda") + dev = tvm.cuda(0) a_np = np.random.uniform(size=shape).astype(a.dtype) c_np = np.zeros(shape=shape, dtype=c.dtype) @@ -463,13 +397,14 @@ def test_cuda_floordiv_with_vectorization(): k = 37 A = te.placeholder((n,), name="A") B = te.compute((n,), lambda i: A[tvm.tir.floordiv(i, k)], name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], nparts=1) - xio, xii = s[B].split(xi, factor=4) - s[B].vectorize(xii) - s[B].bind(xo, bx) - s[B].bind(xio, tx) - func = tvm.build(s, [A, B], "cuda") + + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + xo, xi = sch.split(sch.get_loops("B")[0], factors=[1, None]) + xio, xii = sch.split(xi, factors=[None, 4]) + sch.vectorize(xii) + sch.bind(xo, "blockIdx.x") + sch.bind(xio, "threadIdx.x") + func = tvm.build(sch.mod, target="cuda") dev = tvm.cuda(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) @@ -489,13 +424,13 @@ def test_cuda_floormod_with_vectorization(): k = 37 A = te.placeholder((n,), name="A") B = te.compute((n,), lambda i: A[tvm.tir.floormod(i, k)], name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], nparts=1) - xio, xii = s[B].split(xi, factor=4) - s[B].vectorize(xii) - s[B].bind(xo, bx) - s[B].bind(xio, tx) - func = tvm.build(s, [A, B], "cuda") + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + xo, xi = sch.split(sch.get_loops("B")[0], factors=[1, None]) + xio, xii = sch.split(xi, factors=[None, 4]) + sch.vectorize(xii) + sch.bind(xo, "blockIdx.x") + sch.bind(xio, "threadIdx.x") + func = tvm.build(sch.mod, target="cuda") dev = tvm.cuda(0) a_np = np.random.uniform(size=(n,)).astype(A.dtype) @@ -521,11 +456,11 @@ def check(t0, t1, factor): C = te.compute((n,), lambda i: A[i] + topi.cast(B[i], A.dtype), name="C") # schedule - s = 
tvm.te.create_schedule(C.op) - ob, ib = s[C].split(s[C].op.axis[0], factor=factor) - s[C].vectorize(ib) - s[C].bind(ob, tx) - func = tvm.build(s, [A, B, C], "cuda") + sch = tvm.tir.Schedule(te.create_prim_func([A, B, C])) + ob, ib = sch.split(sch.get_loops("C")[0], factors=[None, factor]) + sch.vectorize(ib) + sch.bind(ob, "threadIdx.x") + func = tvm.build(sch.mod, target="cuda") # correctness dev = tvm.cuda(0) @@ -570,15 +505,16 @@ def skip(t0, t1): check("uint8", "int8", 16) -def sched(B): - s = te.create_schedule(B.op) - io, ii = s[B].split(s[B].op.axis[0], nparts=1) - iio, iii = s[B].split(ii, nparts=32) - _, iiii = s[B].split(iii, factor=4) - s[B].vectorize(iiii) - s[B].bind(io, bx) - s[B].bind(iio, tx) - return s +def sched(A, B): + # schedule + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + io, ii = sch.split(sch.get_loops("B")[0], factors=[1, None]) + iio, iii = sch.split(ii, factors=[32, None]) + _, iiii = sch.split(iii, factors=[None, 4]) + sch.vectorize(iiii) + sch.bind(io, "blockIdx.x") + sch.bind(iio, "threadIdx.x") + return tvm.build(sch.mod, target="cuda") @tvm.testing.requires_gpu @@ -627,8 +563,7 @@ def run_test(tvm_intrin, np_func, dtype): n = 128 A = te.placeholder((n,), dtype=dtype, name="A") B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B") - s = sched(B) - f = tvm.build(s, [A, B], "cuda") + f = sched(A, B) dev = tvm.cuda(0) a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev) @@ -653,8 +588,7 @@ def run_test(tvm_intrin, np_func): n = 128 A = te.placeholder((n,), dtype=dtype, name="A") B = te.compute((n,), lambda i: tvm_intrin(A[i], c2), name="B") - s = sched(B) - f = tvm.build(s, [A, B], "cuda") + f = sched(A, B) dev = tvm.cuda(0) a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev) @@ -679,8 +613,7 @@ def run_test(dtype): n = 128 A = te.placeholder((n,), dtype=dtype, 
name="A") B = te.compute((n,), lambda i: tvm.tir.popcount(A[i]), name="B") - s = sched(B) - f = tvm.build(s, [A, B], "cuda") + f = sched(A, B) dev = tvm.cuda(0) a = tvm.nd.array(np.random.randint(0, 100000, size=n).astype(A.dtype), dev) b = tvm.nd.array(np.zeros(shape=(n,)).astype(B.dtype), dev) @@ -711,12 +644,14 @@ def check_cuda(dtype, n, l, padding, lanes): ), name="B", ) - s = te.create_schedule(B.op) - block, thread, vectorize = s[B].op.axis - s[B].bind(block, bx) - s[B].bind(thread, tx) - s[B].vectorize(vectorize) - fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad") + + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + block, thread, vectorize = sch.get_loops("B") + sch.bind(block, "blockIdx.x") + sch.bind(thread, "threadIdx.x") + sch.vectorize(vectorize) + fun = tvm.build(sch.mod, target="cuda") + np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype) a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a) b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev) @@ -736,205 +671,6 @@ def check_cuda(dtype, n, l, padding, lanes): check_cuda("float32", 64, 16, 3, 4) -def vcf_check_common(s, args): - N = 512 - - # To check if every vectorize loop transforms to ramp expr successfully - stmt = tvm.lower(s, args) - # Use this as a stack flag to show whether this stmt is inside a BroadcastNode - inside_broadcast = [False] - - # Possible patterns: - # Reduce init: BufferStore[Ramp] = Broadcast(0) - # Shared memory copy: BufferStore[Ramp] = BufferLoad[Ramp] - # Compute: BufferStore[Ramp] = BufferLoad[Ramp] ... 
Broadcast[Load] - - def pre_visit(stmt): - if isinstance(stmt, tvm.tir.Broadcast): - inside_broadcast[0] = True - # Check Broadcast[Imm numbers] or Broadcast[Load] patterns - assert isinstance(stmt.value, (tvm.tir.IntImm, tvm.tir.FloatImm, tvm.tir.BufferLoad)) - - if isinstance(stmt, (tvm.tir.BufferStore, tvm.tir.BufferLoad)): - is_ramp_index = isinstance(stmt.indices[-1], tvm.tir.Ramp) - is_vectorized_buffer = re.match(r"^.*x\d+$", stmt.buffer.dtype) - if isinstance(stmt, tvm.tir.BufferLoad): - # Check Broadcast[BufferLoad] or BufferLoad[Ramp] patterns - assert inside_broadcast[0] or is_ramp_index or is_vectorized_buffer - # Skip the rest of the BufferLoad - return stmt - else: - assert is_ramp_index or is_vectorized_buffer - - return None - - def post_visit(stmt): - if isinstance(stmt, tvm.tir.Broadcast): - inside_broadcast[0] = False - return None - - tvm.tir.stmt_functor.ir_transform(stmt["main"].body, pre_visit, post_visit) - - tgt = tvm.target.cuda() - mod = tvm.build(s, args, tgt) - # To check if every vectorize loop transforms to correct instruction - # print(mod.imported_modules[0].get_source()) - - dev = tvm.device("cuda", 0) - a = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev) - b = tvm.nd.array(np.random.uniform(size=(512, 512)).astype("float32"), dev) - c = tvm.nd.array(np.zeros((512, 512), dtype="float32"), dev) - mod(a, b, c) - tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-5) - - -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_vectorized_cooperative_fetching_x(): - N = 512 - A = te.placeholder((N, N), name="A", dtype="float32") - B = te.placeholder((N, N), name="B", dtype="float32") - k = te.reduce_axis((0, N), name="k") - C = te.compute((N, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k)) - s = te.create_schedule(C.op) - i, j = s[C].op.axis - k = s[C].op.reduce_axis[0] - - AA = s.cache_read(A, "shared", [C]) - BB = s.cache_read(B, "shared", [C]) - - i3, i4 = s[C].split(i, 
factor=4) - i2, i3 = s[C].split(i3, factor=2) - i1, i2 = s[C].split(i2, factor=8) - i0, i1 = s[C].split(i1, factor=1) - j3, j4 = s[C].split(j, factor=4) - j2, j3 = s[C].split(j3, factor=2) - j1, j2 = s[C].split(j2, factor=8) - j0, j1 = s[C].split(j1, factor=2) - k1, k2 = s[C].split(k, factor=8) - k0, k1 = s[C].split(k1, factor=8) - s[C].reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3, k2, i4, j4) - block_it = s[C].fuse(i0, j0) - s[C].bind(block_it, tvm.te.thread_axis("blockIdx.x")) - vthread_it = s[C].fuse(i1, j1) - s[C].bind(vthread_it, tvm.te.thread_axis("vthread")) - thread_it = s[C].fuse(i2, j2) - s[C].bind(thread_it, tvm.te.thread_axis("threadIdx.x")) - s[C].vectorize(j4) - - s[AA].compute_at(s[C], k0) - iaa, jaa = s[AA].op.axis - s[BB].compute_at(s[C], k0) - ibb, jbb = s[BB].op.axis - aa_fused = s[AA].fuse(iaa, jaa) - bb_fused = s[BB].fuse(ibb, jbb) - aa1, aa2 = s[AA].split(aa_fused, factor=4) - aa0, aa1 = s[AA].split(aa1, factor=64) - bb1, bb2 = s[BB].split(bb_fused, factor=4) - bb0, bb1 = s[BB].split(bb1, factor=64) - s[AA].bind(aa1, tvm.te.thread_axis("threadIdx.x")) - s[AA].vectorize(aa2) - s[BB].bind(bb1, tvm.te.thread_axis("threadIdx.x")) - s[BB].vectorize(bb2) - - vcf_check_common(s, [A, B, C]) - - -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_vectorized_cooperative_fetching_xy(): - N = 512 - A = te.placeholder((N, N), name="A") - B = te.placeholder((N, N), name="B") - k = te.reduce_axis((0, N), name="k") - C = te.compute((N, N), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k)) - s = te.create_schedule(C.op) - i, j = s[C].op.axis - k = s[C].op.reduce_axis[0] - - AA = s.cache_read(A, "shared", [C]) - BB = s.cache_read(B, "shared", [C]) - - i3, i4 = s[C].split(i, factor=4) - i2, i3 = s[C].split(i3, factor=2) - i1, i2 = s[C].split(i2, factor=8) - i0, i1 = s[C].split(i1, factor=1) - j3, j4 = s[C].split(j, factor=4) - j2, j3 = s[C].split(j3, factor=2) - j1, j2 = s[C].split(j2, factor=8) - j0, j1 = s[C].split(j1, factor=2) - k1, k2 = 
s[C].split(k, factor=8) - k0, k1 = s[C].split(k1, factor=8) - s[C].reorder(i0, j0, i1, j1, i2, j2, k0, k1, i3, j3, k2, i4, j4) - block_it = s[C].fuse(i0, j0) - s[C].bind(block_it, tvm.te.thread_axis("blockIdx.x")) - vthread_it = s[C].fuse(i1, j1) - s[C].bind(vthread_it, tvm.te.thread_axis("vthread")) - s[C].bind(i2, tvm.te.thread_axis("threadIdx.y")) - s[C].bind(j2, tvm.te.thread_axis("threadIdx.x")) - s[C].vectorize(j4) - - s[AA].compute_at(s[C], k0) - iaa, jaa = s[AA].op.axis - s[BB].compute_at(s[C], k0) - ibb, jbb = s[BB].op.axis - aa_fused = s[AA].fuse(iaa, jaa) - bb_fused = s[BB].fuse(ibb, jbb) - aa2, aa3 = s[AA].split(aa_fused, factor=4) - aa1, aa2 = s[AA].split(aa2, factor=8) - aa0, aa1 = s[AA].split(aa1, factor=8) - bb2, bb3 = s[BB].split(bb_fused, factor=4) - bb1, bb2 = s[BB].split(bb2, factor=8) - bb0, bb1 = s[BB].split(bb1, factor=8) - s[AA].bind(aa1, tvm.te.thread_axis("threadIdx.y")) - s[AA].bind(aa2, tvm.te.thread_axis("threadIdx.x")) - s[AA].vectorize(aa3) - s[BB].bind(bb1, tvm.te.thread_axis("threadIdx.y")) - s[BB].bind(bb2, tvm.te.thread_axis("threadIdx.x")) - s[BB].vectorize(bb3) - - vcf_check_common(s, [A, B, C]) - - -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_unrolled_vectorization(): - dtype = "float32" - target = "cuda" - - # Compute declaration - N = 128 - A = te.placeholder((N, N), name="A") - B = te.placeholder((N, N), name="B") - k = te.reduce_axis((0, N), name="k") - C = te.compute((N, N), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C") - - # Schedule - s = te.create_schedule([C.op]) - CC = s.cache_write(C, "local") - i, j = s[C].op.axis - bx, tx, ii, ji = s[C].tile(i, j, 1, 2) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - s[C].vectorize(ji) - s[CC].compute_at(s[C], tx) - i, j = s[CC].op.axis - k = s[CC].op.reduce_axis[0] - ko, ki = s[CC].split(k, 2) - s[CC].unroll(ki) - s[CC].vectorize(j) - - # Check correctness - dev = tvm.device(target) - a_tvm = 
tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev) - b_tvm = tvm.nd.array(np.ones((N, N)).astype(dtype), device=dev) - c_tvm = tvm.nd.empty((N, N), device=dev) - func_tvm = tvm.build(s, [A, B, C], target=target) - func_tvm(a_tvm, b_tvm, c_tvm) - c_np = c_tvm.numpy() - tvm.testing.assert_allclose(c_np, N * np.ones((N, N))) - - @tvm.testing.requires_gpu @tvm.testing.requires_cuda def test_try_unaligned_vector_load(): @@ -950,16 +686,15 @@ def get_compute_aligned(): return get_compute(4, 2, 2) def build(A, C, N, C_N): - s = te.create_schedule(C.op) - oi, ii = s[C].split(C.op.axis[0], factor=2) - s[C].bind(oi, te.thread_axis("threadIdx.x")) - s[C].vectorize(ii) # BUG: misalignment - - tgt = tvm.target.Target(target="cuda", host="llvm") - dev = tvm.device(tgt.kind.name, 0) - f = tvm.build(s, [A, C], tgt, name="foo") - kernel_source = f.imported_modules[0].get_source() + sch = tvm.tir.Schedule(te.create_prim_func([A, C])) + oi, ii = sch.split(sch.get_loops("C")[0], factors=[None, 2]) + sch.bind(oi, "threadIdx.x") + sch.vectorize(ii) # BUG: misalignment + f = tvm.build(sch.mod, target="cuda") + + kernel_source = f.imported_modules[0].get_source() + dev = tvm.cuda() a_data = np.arange(0, N).astype(A.dtype) a = tvm.nd.array(a_data, dev) c = tvm.nd.array(np.zeros(C_N, dtype=C.dtype), dev) @@ -984,28 +719,6 @@ def build(A, C, N, C_N): assert np.allclose(c, expected), f"expected={expected}\nactual={c}" -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_cuda_save_kernels_for_profiling(): - num_thread = 8 - - def check_cuda(n, lanes): - dtype = "float32" - A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes)) - B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(xo, bx) - s[B].bind(xi, tx) - tempdir = utils.tempdir() - tmp_path = str(tempdir.path) - with tvm.transform.PassContext(opt_level=3, config={"cuda.kernels_output_dir": 
tmp_path}): - _ = tvm.build(s, [A, B], "cuda") - assert "tvm_kernels.cu" in os.listdir(tmp_path) - - check_cuda(64, 2) - - @tvm.testing.requires_gpu @tvm.testing.requires_cuda def test_cuda_thread_sync_inside_condition(): diff --git a/tests/python/codegen/test_target_codegen_device.py b/tests/python/codegen/test_target_codegen_device.py index b4181fb7b014..ad27356961aa 100644 --- a/tests/python/codegen/test_target_codegen_device.py +++ b/tests/python/codegen/test_target_codegen_device.py @@ -19,6 +19,7 @@ from tvm.contrib import utils import numpy as np import tvm.testing +from tvm import tir @tvm.testing.requires_gpu @@ -29,16 +30,25 @@ def test_large_uint_imm(): num_thread = 2 A = te.compute((n,), lambda *i: tvm.tir.const(value, "uint64") + other, name="A") - s = te.create_schedule(A.op) - xo, xi = s[A].split(A.op.axis[0], factor=num_thread) - s[A].bind(xi, te.thread_axis("threadIdx.x")) - s[A].bind(xo, te.thread_axis("blockIdx.x")) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A]) + sch = tir.Schedule(mod) + + # Get block and loop + block = sch.get_block("A") + loop = sch.get_loops(block)[0] + + # Split and bind + xo, xi = sch.split(loop, factors=[None, num_thread]) + sch.bind(xi, "threadIdx.x") + sch.bind(xo, "blockIdx.x") def check_target(device): if not tvm.testing.device_enabled(device): return dev = tvm.device(device, 0) - f = tvm.build(s, [A], device) + f = tvm.build(sch.mod, target=device) # launch the kernel. 
a = tvm.nd.empty((n,), dtype=A.dtype, device=dev) f(a) @@ -55,23 +65,36 @@ def test_add_pipeline(): B = te.placeholder((), name="B") C = te.compute(A.shape, lambda *i: A(*i) + B(), name="C") D = te.compute(A.shape, lambda *i: C(*i) + 1, name="D") - s = te.create_schedule(D.op) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B, D]) + sch = tir.Schedule(mod) + + # Get blocks and loops + c_block = sch.get_block("C") + d_block = sch.get_block("D") + c_loop = sch.get_loops(c_block)[0] + d_loop = sch.get_loops(d_block)[0] # GPU schedule have to split by gridIdx and threadIdx num_thread = 256 - xo, xi = s[C].split(C.op.axis[0], factor=num_thread) - s[C].bind(xi, te.thread_axis("threadIdx.x")) - s[C].bind(xo, te.thread_axis("blockIdx.x")) - xo, xi = s[D].split(D.op.axis[0], factor=num_thread) - s[D].bind(xi, te.thread_axis("threadIdx.x")) - s[D].bind(xo, te.thread_axis("blockIdx.x")) + # Schedule C + c_xo, c_xi = sch.split(c_loop, factors=[None, num_thread]) + sch.bind(c_xi, "threadIdx.x") + sch.bind(c_xo, "blockIdx.x") + + # Schedule D + d_xo, d_xi = sch.split(d_loop, factors=[None, num_thread]) + sch.bind(d_xi, "threadIdx.x") + sch.bind(d_xo, "blockIdx.x") def check_target(device, host="stackvm"): if not tvm.testing.device_enabled(device) or not tvm.testing.device_enabled(host): return dev = tvm.device(device, 0) - mhost = tvm.driver.build(s, [A, B, D], target=tvm.target.Target(device, host)) + target = tvm.target.Target(device, host) + mhost = tvm.build(sch.mod, target=target) f = mhost.entry_func # launch the kernel. 
n = 1027 diff --git a/tests/python/codegen/test_target_codegen_extern.py b/tests/python/codegen/test_target_codegen_extern.py index 38fac332e9de..378eb427fd54 100644 --- a/tests/python/codegen/test_target_codegen_extern.py +++ b/tests/python/codegen/test_target_codegen_extern.py @@ -18,6 +18,8 @@ from tvm import te import numpy as np import tvm.testing +import pytest +from tvm import tir @tvm.testing.uses_gpu @@ -56,18 +58,18 @@ def extern_generator_gpu(ins, outs): C_cpu = te.extern(A.shape, [A], extern_generator, name="C") C_gpu = te.extern(A.shape, [A], extern_generator_gpu, name="C") - s_cpu = te.create_schedule(C_cpu.op) - s_gpu = te.create_schedule(C_gpu.op) - print(tvm.lower(s_cpu, [A, C_cpu], simple_mode=True)) - print(tvm.lower(s_gpu, [A, C_gpu], simple_mode=True)) + + # Create IRModules directly + mod_cpu = tvm.IRModule.from_expr(te.create_prim_func([A, C_cpu])) + mod_gpu = tvm.IRModule.from_expr(te.create_prim_func([A, C_gpu])) def check_target(target): if not tvm.testing.device_enabled(target): return - s = s_gpu if target in ["opencl", "cuda"] else s_cpu + mod = mod_gpu if target in ["opencl", "cuda"] else mod_cpu C = C_gpu if target in ["opencl", "cuda"] else C_cpu # build and invoke the kernel. - f = tvm.build(s, [A, C], target) + f = tvm.build(mod, target=target) dev = tvm.device(target, 0) # launch the kernel. n = nn @@ -91,7 +93,9 @@ def extern_generator(ins, outs): return tvm.tir.call_packed("my_extern_array_func1", ins[0], outs[0]) C = te.extern(A.shape, [A], extern_generator, name="C") - s = te.create_schedule(C.op) + + # Create IRModule directly + mod = tvm.IRModule.from_expr(te.create_prim_func([A, C])) @tvm.register_func def my_extern_array_func1(aa, bb): @@ -101,7 +105,7 @@ def check_target(target): if not tvm.testing.device_enabled(target): return # build and invoke the kernel. - f = tvm.build(s, [A, C], target) + f = tvm.build(mod, target=target) dev = tvm.cpu(0) # launch the kernel. 
n = nn @@ -115,6 +119,7 @@ def check_target(target): check_target("llvm") +@pytest.mark.skip("LEGACY-TO-FIX: limitation of create_prim_func with intermediate buffer") def test_pack_buffer_intermediate(): nn = 1024 n = tvm.runtime.convert(nn) @@ -126,13 +131,13 @@ def extern_generator(ins, outs): return tvm.tir.call_packed("my_extern_array_func2", ins[0], outs[0]) C = te.extern(B.shape, [B], extern_generator, name="C") - s = te.create_schedule(C.op) + mod = tvm.IRModule.from_expr(te.create_prim_func([A, C])) def check_target(target): if not tvm.testing.device_enabled(target): return # build and invoke the kernel. - f = tvm.build(s, [A, C], target) + f = tvm.build(mod, target=target) dev = tvm.cpu(0) # launch the kernel. n = nn diff --git a/tests/python/codegen/test_target_codegen_hexagon.py b/tests/python/codegen/test_target_codegen_hexagon.py index c97637f927b7..37e62e5b34ef 100644 --- a/tests/python/codegen/test_target_codegen_hexagon.py +++ b/tests/python/codegen/test_target_codegen_hexagon.py @@ -15,14 +15,15 @@ # specific language governing permissions and limitations # under the License. 
-import numpy as np import os -import pytest import re import sys +import numpy as np +import pytest import tvm import tvm.testing import tvm.contrib.hexagon as hexagon +from tvm import te @pytest.fixture(autouse=True) @@ -39,28 +40,17 @@ def register_linker(): def test_basic(): target = tvm.target.hexagon("v66", hvx=128) - def check_add(offload): + def check_add(): A = tvm.te.placeholder((128,), dtype="uint8", name="A") B = tvm.te.placeholder((128,), dtype="uint8", name="A") C = tvm.te.compute((128,), lambda i: A[i] + B[i], name="C") - s = tvm.te.create_schedule(C.op) - - if offload: - xo, xi = s[C].split(s[C].op.axis[0], nparts=1) - s[C].bind(xo, tvm.te.thread_axis("pipeline")) - m = tvm.build(s, [C, A, B], target=target, name="offload_add") - hexm = m.imported_modules[0] - else: - hexm = tvm.build( - s, [C, A, B], target=tvm.target.Target(target, target), name="native_add" - ) - + mod = tvm.IRModule.from_expr(te.create_prim_func([C, A, B])) + hexm = tvm.build(mod, target=tvm.target.Target(target, target)) asm = hexm.get_source("s") vadds = re.findall(r"v[0-9]+.b = vadd\(v[0-9]+.b,v[0-9]+.b\)", asm) assert vadds # Check that it's non-empty - check_add(True) - check_add(False) + check_add() @tvm.testing.requires_hexagon @@ -69,48 +59,22 @@ def test_llvm_target_features(): # Define some trivial compute A = tvm.te.placeholder((128,), dtype="uint8", name="A") C = tvm.te.compute((128,), lambda i: A[i] + 1, name="C") - s = tvm.te.create_schedule(C.op) - m = tvm.build(s, [C, A], target=tvm.target.Target(target, target), name="add_one") + mod = tvm.IRModule.from_expr(te.create_prim_func([C, A]).with_attr("global_symbol", "add_one")) + m = tvm.build(mod, target=tvm.target.Target(target, target)) llvm_ir = m.get_source("ll") # Make sure we find +hvx-length128b in "attributes". 
fs = re.findall(r"attributes.*\+hvx-length128b", llvm_ir) assert fs # Check that it's non-empty -@tvm.testing.requires_hexagon -def test_alloc_vtcm(): - target = tvm.target.hexagon("v66") - - buf_len = 2048 - A = tvm.te.placeholder((buf_len,), name="A", dtype="int8") - B = tvm.te.placeholder((buf_len,), name="B", dtype="int8") - - A_buf = tvm.te.compute((buf_len,), lambda *i: A(*i), "A_buf") - B_buf = tvm.te.compute((buf_len,), lambda *i: B(*i), "B_buf") - C = tvm.te.compute((buf_len,), lambda *i: A_buf(*i) + B_buf(*i), name="C") - s = tvm.te.create_schedule(C.op) - - # Use VTCM for each buffer. - s[A_buf].set_scope("local.vtcm") - s[B_buf].set_scope("local.vtcm") - - config = {"tir.add_lower_pass": hexagon.ir_lower_vtcm_pass()} - with tvm.transform.PassContext(config=config): - irmod = tvm.lower(s, [A, B, C], name="alloc_vtcm") - - calls = re.findall("HexagonBackend[A-Za-z]*VTCM", str(irmod["alloc_vtcm"])) - assert "HexagonBackendAllocateVTCM" in calls - assert "HexagonBackendFreeVTCM" in calls - - @tvm.testing.requires_hexagon def test_llvm_options(): target = tvm.target.hexagon("v66", llvm_options="-hexagon-noopt") Zero = tvm.te.compute((10,), lambda _: tvm.tir.const(0, "int32")) - s = tvm.te.create_schedule(Zero.op) - tvm.build(s, [Zero], target=target, name="zero") + mod = tvm.IRModule.from_expr(te.create_prim_func([Zero])) # Check that BuildHexagon hasn't crashed because of target attribute # type mismatch. 
+ tvm.build(mod, target=tvm.target.Target(target, target)) assert re.search("-hexagon-noopt", str(target)) diff --git a/tests/python/codegen/test_target_codegen_llvm.py b/tests/python/codegen/test_target_codegen_llvm.py index d629d93d365e..e3ccff49ba1b 100644 --- a/tests/python/codegen/test_target_codegen_llvm.py +++ b/tests/python/codegen/test_target_codegen_llvm.py @@ -26,6 +26,7 @@ import tvm import tvm.testing from tvm import te +from tvm import tir from tvm.contrib import clang, utils from tvm.script import tir as T, ir as I from tvm.target.codegen import llvm_get_intrinsic_name, llvm_lookup_intrinsic_id @@ -85,8 +86,13 @@ def use_llvm_intrinsic(A, C): C = tvm.te.extern( (1, 1), [A], lambda ins, outs: use_llvm_intrinsic(ins[0], outs[0]), name="C", dtype="int32" ) - s = tvm.te.create_schedule(C.op) - f = tvm.build(s, [A, C], target="llvm") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, C]) + sch = tir.Schedule(mod) + + # Build from scheduled TIR + f = tvm.build(sch.mod, target="llvm") @tvm.testing.requires_llvm @@ -108,10 +114,13 @@ def test_llvm_large_uintimm(): value = (1 << 63) + 123 other = tvm.tir.const(3, "uint64") A = te.compute((), lambda: tvm.tir.const(value, "uint64") + other, name="A") - s = te.create_schedule(A.op) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A]) + sch = tir.Schedule(mod) def check_llvm(): - f = tvm.build(s, [A], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. 
a = tvm.nd.empty((), dtype=A.dtype, device=dev) @@ -122,24 +131,38 @@ def check_llvm(): @tvm.testing.requires_llvm -def test_llvm_persist_parallel(): +def test_llvm_multi_parallel(): n = 128 A = te.placeholder((n,), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1, name="B") C = te.compute(A.shape, lambda *i: te.sqrt(B(*i)) * 2 + 2, name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=8) - xo1, xo2 = s[C].split(xo, nparts=1) - s[B].compute_at(s[C], xo1) - s[B].parallel(s[B].op.axis[0]) - s[B].pragma(s[B].op.axis[0], "parallel_barrier_when_finish") - s[C].parallel(xi) - s[C].pragma(xo1, "parallel_launch_point") - s[C].pragma(xi, "parallel_stride_pattern") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, C]) + sch = tir.Schedule(mod) + + # Get blocks and loops + c_block = sch.get_block("C") + b_block = sch.get_block("B") + c_loop = sch.get_loops(c_block)[0] + + # Split and parallelize + xo, xi = sch.split(c_loop, factors=[None, 8]) + xo1, xo2 = sch.split(xo, factors=[1, None]) + + # Move computation of B + sch.compute_at(b_block, xo1) + + # Get B's loop after compute_at + b_loop = sch.get_loops(b_block)[0] + + # Apply parallel scheduling + sch.parallel(b_loop) + sch.parallel(xi) def check_llvm(): # BUILD and invoke the kernel. - f = tvm.build(s, [A, C], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. 
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) @@ -156,12 +179,22 @@ def check_llvm(nn, base): n = tvm.runtime.convert(nn) A = te.placeholder((n + base), name="A") C = te.compute((n,), lambda i: A(nn + base - i - 1), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - s[C].parallel(xo) - s[C].vectorize(xi) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, C]) + sch = tir.Schedule(mod) + + # Get block and loop + block = sch.get_block("C") + loop = sch.get_loops(block)[0] + + # Split and parallelize + xo, xi = sch.split(loop, factors=[None, 4]) + sch.parallel(xo) + sch.vectorize(xi) + # build and invoke the kernel. - f = tvm.build(s, [A, C], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. n = nn @@ -178,29 +211,31 @@ def check_llvm(nn, base): @tvm.testing.requires_llvm def test_llvm_vadd_pipeline(): - def check_llvm(n, lanes): - A = te.placeholder((n,), name="A", dtype="float32x%d" % lanes) - B = te.compute((n,), lambda i: A[i], name="B") - C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], nparts=2) - _, xi = s[C].split(xi, factor=2) - s[C].parallel(xo) - s[C].vectorize(xi) - s[B].compute_at(s[C], xo) - xo, xi = s[B].split(B.op.axis[0], factor=2) - s[B].vectorize(xi) - # build and invoke the kernel. - f = tvm.build(s, [A, C], "llvm") - dev = tvm.cpu(0) - # launch the kernel. 
- a = tvm.nd.empty((n,), A.dtype).copyfrom(np.random.uniform(size=(n, lanes))) - c = tvm.nd.empty((n,), C.dtype, dev) - f(a, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1) - - check_llvm(64, 2) - check_llvm(512, 2) + n = te.size_var("n") + A = te.placeholder((n,), name="A") + B = te.placeholder((n,), name="B") + C = te.compute((n,), lambda i: A[i] + B[i], name="C") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B, C]) + sch = tir.Schedule(mod) + + # Get block and loop + block = sch.get_block("C") + loop = sch.get_loops(block)[0] + + # Split the loop + _, inner = sch.split(loop, factors=[None, 4]) + sch.vectorize(inner) + # Build and verify + f = tvm.build(sch.mod, target="llvm") + dev = tvm.cpu(0) + n = 128 + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) + f(a, b, c) + tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) @tvm.testing.requires_llvm @@ -209,12 +244,22 @@ def check_llvm(nn, base, stride): n = tvm.runtime.convert(nn) A = te.placeholder((n + base, stride), name="A") C = te.compute((n, stride), lambda i, j: A(base + i, j) + 1, name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - s[C].parallel(xo) - s[C].vectorize(xi) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, C]) + sch = tir.Schedule(mod) + + # Get block and loops + block = sch.get_block("C") + i_loop, j_loop = sch.get_loops(block) + + # Split and parallelize + xo, xi = sch.split(i_loop, factors=[None, 4]) + sch.parallel(xo) + sch.vectorize(xi) + # build and invoke the kernel. - f = tvm.build(s, [A, C], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. 
n = nn @@ -237,11 +282,14 @@ def test_llvm_temp_space(): A = te.placeholder((n,), name="A") B = te.compute(A.shape, lambda i: A(i) + 1, name="B") C = te.compute(A.shape, lambda i: B(i) + 1, name="C") - s = te.create_schedule(C.op) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, C]) + sch = tir.Schedule(mod) def check_llvm(): # build and invoke the kernel. - f = tvm.build(s, [A, C], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. n = nn @@ -255,36 +303,37 @@ def check_llvm(): @tvm.testing.requires_llvm def test_multiple_func(): - nn = 1024 - n = tvm.runtime.convert(nn) + # Define the computation + n = te.size_var("n") A = te.placeholder((n,), name="A") B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - s[C].parallel(xo) - s[C].vectorize(xi) - - def check_llvm(): - # build two functions - f2 = tvm.lower(s, [A, B, C], name="fadd1") - f1 = tvm.lower(s, [A, B, C], name="fadd2") - m = tvm.build([f1, f2], "llvm") - fadd2 = m["fadd2"] - fadd1 = m["fadd1"] + C = te.compute((n,), lambda i: A[i] + B[i], name="C") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B, C]) + sch = tir.Schedule(mod) + + # Create two functions with different names + mod = tvm.IRModule( + { + "fadd1": sch.mod["main"].with_attr("global_symbol", "fadd1"), + "fadd2": sch.mod["main"].with_attr("global_symbol", "fadd2"), + } + ) - dev = tvm.cpu(0) - # launch the kernel. 
- n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - fadd1(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - fadd2(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) + # Build and verify + f = tvm.build(mod, target="llvm") + dev = tvm.cpu(0) + n = 10 + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - check_llvm() + # Test both functions + f["fadd1"](a, b, c) + tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) + f["fadd2"](a, b, c) + tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) @tvm.testing.requires_llvm @@ -292,9 +341,13 @@ def test_llvm_condition(): def check_llvm(n, offset): A = te.placeholder((n,), name="A") C = te.compute((n,), lambda i: tvm.tir.if_then_else(i >= offset, A[i], 0.0), name="C") - s = te.create_schedule(C.op) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, C]) + sch = tir.Schedule(mod) + # build and invoke the kernel. - f = tvm.build(s, [A, C], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) @@ -312,9 +365,13 @@ def test_llvm_bool(): def check_llvm(n): A = te.placeholder((n,), name="A", dtype="int32") C = te.compute((n,), lambda i: A[i].equal(1).astype("float"), name="C") - s = te.create_schedule(C.op) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, C]) + sch = tir.Schedule(mod) + # build and invoke the kernel. - f = tvm.build(s, [A, C], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. 
a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) @@ -334,9 +391,13 @@ def check_llvm(n): k = te.reduce_axis((0, n), name="k") C = te.compute((), lambda: te.sum(A[k] * scale(), axis=k), name="C") D = te.compute((), lambda: C() + 1) - s = te.create_schedule(D.op) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, scale, D]) + sch = tir.Schedule(mod) + # build and invoke the kernel. - f = tvm.build(s, [A, scale, D], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) @@ -358,9 +419,13 @@ def check_llvm(n): k = te.reduce_axis((0, n), name="k") C = te.compute((), lambda: te.sum(A[k] * scale(), axis=k), name="C") D = te.compute((), lambda: C() + 1) - s = te.create_schedule(D.op) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, scale, D]) + sch = tir.Schedule(mod) + # build and invoke the kernel. - f = tvm.build(s, [A, scale, D], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. 
a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) @@ -378,10 +443,21 @@ def test_alignment(): n = tvm.runtime.convert(1024) A = te.placeholder((n,), name="A") B = te.compute(A.shape, lambda i: A[i] * 3, name="B") - s = te.create_schedule(B.op) - bx, tx = s[B].split(B.op.axis[0], factor=8) - s[B].vectorize(tx) - f = tvm.build(s, [A, B], "llvm", name="test_alignment") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B]).with_attr("global_symbol", "test_alignment") + sch = tir.Schedule(mod) + + # Get block and loop + block = sch.get_block("B") + loop = sch.get_loops(block)[0] + + # Split and vectorize + _, tx = sch.split(loop, factors=[None, 8]) + sch.vectorize(tx) + + # Build with name + f = tvm.build(sch.mod, target="llvm") lines = f.get_source().split("\n") @@ -452,8 +528,12 @@ def clipb(x): lambda i, j: (div(clipa(A[i]), clipb(B[j])), mod(clipa(A[i]), clipb(B[j]))), ) - s = te.create_schedule([D.op, M.op]) - f = tvm.build(s, [A, B, D, M], "llvm") + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B, D, M]) + sch = tir.Schedule(mod) + + # Build from scheduled TIR + f = tvm.build(sch.mod, target="llvm") # Fill input arrays with values A_arr = tvm.nd.empty((end - start + 1,), dtype) @@ -477,7 +557,7 @@ def _show_info(): print("dtype: {}".format(dtype)) print("dividend range: [{}, {}]".format(start, end)) print("divisor range: [{}, {}]".format(dstart, dend)) - lowered = tvm.lower(s, [A, B, D, M], simple_mode=True) + lowered = tvm.lower(sch.mod, simple_mode=True) print("Lowered code:") print(lowered) @@ -557,8 +637,12 @@ def check_llvm_reciprocal(n): A = te.placeholder((n,), name="A") B = te.compute((n,), lambda i: te.div(1.0, (1e37 * A[i])), name="B") - s = te.create_schedule(B.op) - f = tvm.build(s, [A, B], "llvm") + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B]) + sch = tir.Schedule(mod) + + # Build from scheduled TIR + f = tvm.build(sch.mod, target="llvm") a = 
tvm.nd.array(np.full((n,), 100, "float32")) b = tvm.nd.empty((n,), "float32") @@ -573,8 +657,12 @@ def check_llvm_sigmoid(n): A = te.placeholder((n,), name="A") B = te.compute((n,), lambda i: te.sigmoid(A[i]), name="B") - s = te.create_schedule(B.op) - f = tvm.build(s, [A, B], "llvm") + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B]) + sch = tir.Schedule(mod) + + # Build from scheduled TIR + f = tvm.build(sch.mod, target="llvm") a = tvm.nd.array(np.full((n,), -1000, "float32")) b = tvm.nd.empty((n,), "float32") @@ -593,10 +681,19 @@ def test_dwarf_debug_information(): A = te.placeholder((n,), name="A") B = te.placeholder((n,), name="B") C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - s[C].parallel(xo) - s[C].vectorize(xi) + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B, C]) + sch = tir.Schedule(mod) + + # Get block and loop + block = sch.get_block("C") + loop = sch.get_loops(block)[0] + + # Split and parallelize + xo, xi = sch.split(loop, factors=[None, 4]) + sch.parallel(xo) + sch.vectorize(xi) def check_llvm_object(): if tvm.target.codegen.llvm_version_major() < 5: @@ -604,9 +701,13 @@ def check_llvm_object(): if tvm.target.codegen.llvm_version_major() > 6: return # build two functions - f2 = tvm.lower(s, [A, B, C], name="fadd1") - f1 = tvm.lower(s, [A, B, C], name="fadd2") - m = tvm.build([f1, f2], "llvm") + mod = tvm.IRModule( + { + "fadd1": sch.mod["main"].with_attr("global_symbol", "fadd1"), + "fadd2": sch.mod["main"].with_attr("global_symbol", "fadd2"), + } + ) + m = tvm.build(mod, target="llvm") temp = utils.tempdir() o_path = temp.relpath("temp.o") m.save(o_path) @@ -638,9 +739,13 @@ def check_llvm_ir(): if tvm.target.codegen.llvm_version_major() > 6: return # build two functions - f2 = tvm.lower(s, [A, B, C], name="fadd1") - f1 = tvm.lower(s, [A, B, C], name="fadd2") - m = tvm.build([f1, f2], target="llvm 
-mtriple=aarch64-linux-gnu") + mod = tvm.IRModule( + { + "fadd1": sch.mod["main"].with_attr("global_symbol", "fadd1"), + "fadd2": sch.mod["main"].with_attr("global_symbol", "fadd2"), + } + ) + m = tvm.build(mod, target="llvm -mtriple=aarch64-linux-gnu") ll = m.get_source("ll") # On non-Darwin OS, don't explicitly specify DWARF version. @@ -650,7 +755,7 @@ def check_llvm_ir(): assert re.search(r"""llvm.dbg.value""", ll) # Try Darwin, require DWARF-2 - m = tvm.build([f1, f2], target="llvm -mtriple=x86_64-apple-darwin-macho") + m = tvm.build(mod, target="llvm -mtriple=x86_64-apple-darwin-macho") ll = m.get_source("ll") assert re.search(r"""i32 4, !"Dwarf Version", i32 2""", ll) assert re.search(r"""llvm.dbg.value""", ll) @@ -664,7 +769,10 @@ def test_llvm_shuffle(): a = te.placeholder((8,), "int32") b = te.placeholder((8,), "int32") c = te.compute((8,), lambda x: a[x] + b[7 - x]) - sch = te.create_schedule(c.op) + + # Convert to TIR and create schedule + mod = te.create_prim_func([a, b, c]) + sch = tir.Schedule(mod) def my_vectorize(): def vectorizer(op): @@ -685,8 +793,8 @@ def _transform(f, *_): return tvm.tir.transform.prim_func_pass(_transform, opt_level=0, name="my_vectorize") with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, my_vectorize())]}): - ir = tvm.lower(sch, [a, b, c], simple_mode=True) - module = tvm.build(sch, [a, b, c]) + ir = tvm.lower(sch.mod, simple_mode=True) + module = tvm.build(sch.mod) a_ = tvm.nd.array(np.arange(1, 9, dtype="int32")) b_ = tvm.nd.array(np.arange(8, 0, -1, dtype="int32")) c_ = tvm.nd.array(np.zeros((8,), dtype="int32")) @@ -727,12 +835,21 @@ def dotest(do_vectorize): np.random.seed(122) A = te.placeholder((32,), dtype="bfloat16") B = te.placeholder((32,), dtype="bfloat16") - d = te.compute((32,), lambda x: A[x] + B[x]) - sch = te.create_schedule(d.op) + D = te.compute((32,), lambda x: A[x] + B[x], name="D") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B, D]) + sch = tir.Schedule(mod) + 
+ # Get block and loop + block = sch.get_block("D") + loop = sch.get_loops(block)[0] + + # Apply vectorization if requested if do_vectorize: - sch[d].vectorize(d.op.axis[0]) + sch.vectorize(loop) - module = tvm.build(sch, [A, B, d]) + module = tvm.build(sch.mod, target="llvm") npa = np.random.rand(32).astype("float32") npb = np.random.rand(32).astype("float32") va = np_bf16_cast_and_cast_back(npa) @@ -762,72 +879,6 @@ def test_llvm_crt_static_lib(): module.save("test.o") -def atomic_add(x, y): - return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) - - -@tvm.testing.requires_llvm -def test_llvm_lower_atomic(): - def do_atomic_add(A): - ib = tvm.tir.ir_builder.create() - n = A.shape[0] - atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local") - one = tvm.tir.const(1, A.dtype) - A_ptr = ib.buffer_ptr(A) - with ib.for_range(0, n, name="i", kind="parallel") as i: - atomic_add_return[0] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one - ) - return ib.get() - - A = tvm.te.placeholder((100,), dtype="int32", name="A") - C = tvm.te.extern((100,), [A], lambda ins, _: do_atomic_add(ins[0]), name="C", dtype="int32") - s = tvm.te.create_schedule(C.op) - # This does not work because of pointer type mismatch - # TVMError: LLVM module verification failed with the following errors: - # Argument value type does not match pointer operand type! 
- # %21 = atomicrmw add i8* %7, i32 1 monotonic - # i8 - # f = tvm.build(s, [A], target="llvm") - - -@tvm.testing.requires_llvm -@tvm.testing.requires_gpu -def test_llvm_gpu_lower_atomic(): - def do_atomic_add(A): - ib = tvm.tir.ir_builder.create() - n = A.shape[0] - atomic_add_return = ib.allocate(A.dtype, (1,), name="atomic_add_return", scope="local") - one = tvm.tir.const(1, A.dtype) - A_ptr = ib.buffer_ptr(A) - nthread_tx = 64 - with ib.new_scope(): - nthread_bx = (n + nthread_tx - 1) // nthread_tx - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) - atomic_add_return[0] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", A_ptr[0]), one - ) - return ib.get() - - size = 1024 - # CI uses LLVM 8, which does not support float atomic - for dtype in ["int32"]: - A = tvm.te.placeholder((size,), dtype=dtype, name="A") - C = tvm.te.extern((size,), [A], lambda ins, _: do_atomic_add(ins[0]), dtype=dtype) - s = tvm.te.create_schedule(C.op) - f = tvm.build(s, [A], target="nvptx") - - dev = tvm.cuda() - a = tvm.nd.array(np.zeros((size,)).astype(A.dtype), dev) - f(a) - ref = np.zeros((size,)).astype(A.dtype) - ref[0] = size - tvm.testing.assert_allclose(a.numpy(), ref, rtol=1e-5) - - @tvm.testing.requires_llvm def test_llvm_order_functions(): """Check that functions in the LLVM module are ordered alphabetically.""" @@ -850,7 +901,7 @@ def make_call_extern(caller, callee): "Kirby": make_call_extern("Kirby", "Fred"), } mod = tvm.IRModule(functions=functions) - ir_text = tvm.build(mod, None, target="llvm").get_source("ll") + ir_text = tvm.build(mod, target="llvm").get_source("ll") # Skip functions whose names start with _. 
matches = re.findall(r"^define[^@]*@([a-zA-Z][a-zA-Z0-9_]*)", ir_text, re.MULTILINE) assert matches == sorted(matches) @@ -879,13 +930,14 @@ def check_llvm(use_file): temp = utils.tempdir() ll_path = temp.relpath("temp.ll") ll_code = clang.create_llvm(cc_code, output=ll_path) - s = te.create_schedule(B.op) + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + if use_file: - s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path) + sch.annotate(sch.get_loops("B")[0], "pragma_import_llvm", ll_path) else: - s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code) + sch.annotate(sch.get_loops("B")[0], "pragma_import_llvm", ll_code) # BUILD and invoke the kernel. - f = tvm.build(s, [A, B], "llvm") + f = tvm.build(sch.mod, target="llvm") dev = tvm.cpu(0) # launch the kernel. a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) @@ -910,7 +962,7 @@ def test_llvm_scalar_concat(): # This will crash in LLVM codegen if CodeGenLLVM::CreateVecConcat doesn't convert # scalars to single-lane LLVM vectors. 
with tvm.transform.PassContext(config={"tir.disable_assert": True}): - m = tvm.build(mod, [x, y, z], target="llvm") + m = tvm.build(mod, target="llvm") @tvm.testing.requires_llvm @@ -925,7 +977,7 @@ def threadpool_nested_parallel_loop( B[i, j] = A[i, j] * 2.0 with pytest.raises(tvm.TVMError) as e: - tvm.build({"llvm": tvm.IRModule.from_expr(threadpool_nested_parallel_loop)}) + tvm.build(tvm.IRModule.from_expr(threadpool_nested_parallel_loop), target="llvm") msg = str(e) assert msg.find("Nested parallel loop is not supported") != -1 @@ -939,13 +991,16 @@ def test_llvm_target_attributes(): A = te.placeholder((n,), name="A", dtype="float32") B = te.compute((n,), lambda i: A[i], name="B") C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], nparts=2) - s[C].parallel(xo) + + sch = tvm.tir.Schedule( + te.create_prim_func([A, B, C, n]).with_attr("global_symbol", "test_func") + ) + xo, xi = sch.split(sch.get_loops("C")[0], factors=[2, None]) + sch.parallel(xo) target_llvm = "llvm -mtriple=x86_64-linux-gnu -mcpu=skylake -mattr=+avx512f" target = tvm.target.Target(target_llvm, host=target_llvm) - module = tvm.build(s, [A, B, C, n], target=target, name="test_func") + module = tvm.build(sch.mod, target=target) llvm_ir = module.get_source() llvm_ir_lines = llvm_ir.split("\n") @@ -996,7 +1051,7 @@ def tir_assume_func(A: T.Buffer((4, 4), "int32"), B: T.Buffer((14,), "int32")): mod = tvm.IRModule.from_expr(tir_assume_func) inp = te.placeholder((4, 4), name="A", dtype="int32") out = te.placeholder((14,), name="B", dtype="int32") - m = tvm.build(mod, [inp, out], target="llvm") + m = tvm.build(mod, target="llvm") @tvm.testing.requires_llvm diff --git a/tests/python/codegen/test_target_codegen_opencl.py b/tests/python/codegen/test_target_codegen_opencl.py index 079553665ffb..90af959472c5 100644 --- a/tests/python/codegen/test_target_codegen_opencl.py +++ 
b/tests/python/codegen/test_target_codegen_opencl.py @@ -135,9 +135,12 @@ def test_opencl_erf(): def check_erf(dev, n, dtype): A = te.placeholder((n,), name="A", dtype=dtype) C = te.compute(A.shape, lambda *i: te.erf(A(*i)), name="C") - s = te.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) - fun = tvm.build(s, [A, C], target) + func = te.create_prim_func([A, C]) + sch = tvm.tir.Schedule(func) + (x,) = sch.get_loops(sch.get_block("C")) + sch.bind(x, "threadIdx.x") + fun = tvm.build(sch.mod, target=target) + source_str = fun.imported_modules[0].get_source() matches = re.findall("erf", source_str) error_matches = re.findall("erff", source_str) diff --git a/tests/python/codegen/test_target_codegen_rocm.py b/tests/python/codegen/test_target_codegen_rocm.py index a0990c330f03..4c7592034ef0 100644 --- a/tests/python/codegen/test_target_codegen_rocm.py +++ b/tests/python/codegen/test_target_codegen_rocm.py @@ -18,41 +18,8 @@ import tvm.testing from tvm import te import numpy as np -import unittest from tvm.script import tir as T -tx = te.thread_axis("threadIdx.x") -ty = te.thread_axis("threadIdx.y") -bx = te.thread_axis("blockIdx.x") -by = te.thread_axis("blockIdx.y") - - -@tvm.testing.requires_rocm -def test_rocm_cross_thread_reduction(): - # based on the reduction tutorial - n = te.size_var("n") - m = te.size_var("m") - A = te.placeholder((n, m), name="A") - k = te.reduce_axis((0, m), "k") - B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B") - s = te.create_schedule(B.op) - ko, ki = s[B].split(B.op.reduce_axis[0], factor=16) - BF = s.rfactor(B, ki) - xo, xi = s[B].split(s[B].op.axis[0], factor=32) - s[B].bind(xo, bx) - s[B].bind(xi, ty) - s[B].bind(s[B].op.reduce_axis[0], tx) - s[BF].compute_at(s[B], s[B].op.reduce_axis[0]) - s[B].set_store_predicate(tx.var.equal(0)) - frocm = tvm.build(s, [A, B], "rocm") - - nn = 128 - dev = tvm.rocm(0) - a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev) - b = 
tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) - frocm(a, b) - tvm.testing.assert_allclose(b.numpy(), np.sum(a.numpy(), axis=1), rtol=1e-4) - @tvm.testing.requires_rocm def test_rocm_inf_nan(): @@ -60,9 +27,11 @@ def check_inf_nan(dev, n, value, dtype): A = te.placeholder((n,), name="A", dtype=dtype) inf_value = tvm.tir.const(value, dtype=dtype) C = te.compute((n,), lambda i: inf_value, name="C") - s = te.create_schedule(C.op) - s[C].bind(s[C].op.axis[0], tx) - fun = tvm.build(s, [A, C], "rocm") + sch = tvm.tir.Schedule(te.create_prim_func([A, C])) + xo, xi = sch.split(sch.get_loops("C")[0], factors=[None, 128]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + fun = tvm.build(sch.mod, "rocm") a = tvm.nd.empty((n,), A.dtype, dev) c = tvm.nd.empty((n,), A.dtype, dev) # Only need to test compiling here @@ -78,19 +47,6 @@ def check_inf_nan(dev, n, value, dtype): check_inf_nan(dev, 1, float("nan"), "float64") -@tvm.testing.requires_rocm -def test_rocm_reduction_binding(): - k = te.reduce_axis((0, 32), "k") - A = te.placeholder((96, 32), name="A") - B = te.compute((96,), lambda m: te.sum(A[m, k], axis=k), name="B") - s = te.create_schedule(B.op) - - s[B].reorder(B.op.reduce_axis[0], B.op.axis[0]) - - mo, _ = s[B].split(B.op.axis[0], 32) - s[B].bind(mo, bx) - - @tvm.testing.requires_rocm def test_rocm_copy(): def check_rocm(dtype, n): @@ -116,11 +72,12 @@ def test_rocm_vectorize_add(): def check_rocm(dtype, n, lanes): A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes)) B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(xo, bx) - s[B].bind(xi, tx) - fun = tvm.build(s, [A, B], "rocm") + sch = tir.Schedule(te.create_prim_func([A, B])) + xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 4]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + fun = tvm.build(sch.mod, target="rocm") + dev = tvm.rocm(0) a = 
tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes))) c = tvm.nd.empty((n,), B.dtype, dev) @@ -179,13 +136,3 @@ def func( b = tvm.nd.array(np.zeros((4,)).astype("float32"), dev) mod(a, b) tvm.testing.assert_allclose(b.numpy(), np.exp2(a.numpy())) - - -if __name__ == "__main__": - test_rocm_cross_thread_reduction() - test_rocm_inf_nan() - test_rocm_reduction_binding() - test_rocm_copy() - test_rocm_vectorize_add() - test_rocm_warp_shuffle() - test_rocm_vectorized_exp() diff --git a/tests/python/codegen/test_target_codegen_vulkan.py b/tests/python/codegen/test_target_codegen_vulkan.py index 9d00f047cb69..0e1aa1a0403b 100644 --- a/tests/python/codegen/test_target_codegen_vulkan.py +++ b/tests/python/codegen/test_target_codegen_vulkan.py @@ -26,7 +26,7 @@ import tvm import tvm.testing -from tvm import te +from tvm import te, tir from tvm.topi.math import cast from tvm.script import tir as T, ir as I from tvm.tir import TensorIntrin, IntImm, Cast, Schedule @@ -60,9 +60,10 @@ ] ) ) -def test_vector_comparison(target, dtype): - n = (1024,) - A = te.placeholder(n, dtype=dtype, name="A") +def test_vector_comparison(target, dev, dtype): + target = tvm.target.Target(target) + n = 1024 + A = te.placeholder((n,), dtype=dtype, name="A") B = te.compute( A.shape, lambda i: tvm.tir.Select( @@ -70,14 +71,18 @@ def test_vector_comparison(target, dtype): ), name="B", ) - s = te.create_schedule(B.op) - (bx, tx) = s[B].split(s[B].op.axis[0], factor=128) - (tx, vx) = s[B].split(tx, factor=4) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) - s[B].vectorize(vx) - f = tvm.build(s, [A, B], target) + # Create IRModule + mod = tvm.IRModule.from_expr(te.create_prim_func([A, B])) + sch = tir.Schedule(mod) + (bx, tx) = sch.split(sch.get_loops("B")[0], factors=[None, 128]) + (tx, vx) = sch.split(tx, factors=[None, 4]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + sch.vectorize(vx) + + # Build + f = 
tvm.build(sch.mod, target=target) # Verify we generate the boolx4 type declaration and the OpSelect # v4{float,half,int} instruction @@ -102,133 +107,48 @@ def test_array_copy(dev, dtype, fuzz_seed): @tvm.testing.exclude_targets("llvm") def test_array_vectorize_add(target, dev, dtype): + target = tvm.target.Target(target) arr_size = 64 lanes = 2 - if "opencl" in target and dtype == "float16": - pytest.xfail("Opencl target does not support float16") - num_thread = 8 + if "opencl" in str(target) and dtype == "float16": + pytest.xfail("Opencl target does not support float16") A = te.placeholder((arr_size,), name="A", dtype="%sx%d" % (dtype, lanes)) - B = te.compute((arr_size,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - fun = tvm.build(s, [A, B], target) + B = te.compute(A.shape, lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B") + + sch = tir.Schedule(te.create_prim_func([A, B])) + xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 4]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + f = tvm.build(sch.mod, target=target) + a = tvm.nd.empty((arr_size,), A.dtype, dev).copyfrom(np.random.uniform(size=(arr_size, lanes))) c = tvm.nd.empty((arr_size,), B.dtype, dev) - fun(a, c) + f(a, c) tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1) -@tvm.testing.parametrize_targets("vulkan") -@pytest.mark.skip("Flaky, https://github.com/apache/tvm/issues/10779") -def test_vulkan_stress(target, dev): - """ - Launch a randomized test with multiple kernels per stream, multiple uses of - kernels per stream, over multiple threads. 
- """ - - n = 1024 - num_thread = 64 - - def run_stress(): - def worker(): - A = te.placeholder((n,), name="A", dtype="float32") - B = te.placeholder((n,), name="B", dtype="float32") - functions = [ - ( - lambda: te.compute((n,), lambda i: 2 * A[i] + 3 * B[i]), - lambda a, b: 2 * a + 3 * b, - ), - (lambda: te.compute((n,), lambda i: A[i] + B[i]), lambda a, b: a + b), - (lambda: te.compute((n,), lambda i: A[i] + 2 * B[i]), lambda a, b: a + 2 * b), - ] - - def build_f(f_ref): - (C_f, ref) = f_ref - C = C_f() - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=num_thread) - s[C].bind(xo, te.thread_axis("blockIdx.x")) - s[C].bind(xi, te.thread_axis("threadIdx.x")) - fun = tvm.build(s, [A, B, C], target) - return (fun, ref) - - fs = [ - build_f(random.choice(functions)) for _ in range(np.random.randint(low=1, high=10)) - ] - a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n,))) - b = tvm.nd.empty((n,), B.dtype, dev).copyfrom(np.random.uniform(size=(n,))) - cs = [tvm.nd.empty((n,), A.dtype, dev) for _ in fs] - for (f, _), c in zip(fs, cs): - f(a, b, c) - - for (_, ref), c in zip(fs, cs): - tvm.testing.assert_allclose(c.numpy(), ref(a.numpy(), b.numpy())) - - ts = [threading.Thread(target=worker) for _ in range(np.random.randint(1, 10))] - for t in ts: - t.start() - for t in ts: - t.join() - - run_stress() - - @tvm.testing.exclude_targets("llvm") def test_vulkan_bool_load(target, dev): - arr_size = 1024 - target = tvm.target.Target(target) - if target.kind.name == "vulkan": - supports_int8_buffer = target.attrs.get("supports_int8", False) and target.attrs.get( - "supports_8bit_buffer", False - ) - if not supports_int8_buffer: - pytest.xfail( - "Vulkan target does not support int8 buffer access, used to transfer booleans" - ) - - def do_copy(A, B, n): - ib = tvm.tir.ir_builder.create() - A = ib.buffer_ptr(A) - B = ib.buffer_ptr(B) - - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - - max_threads = 32 - 
ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads)) - ib.scope_attr(tx, "thread_extent", max_threads) - tid = bx * max_threads + tx - - with ib.if_scope(tid < n): - B[tid] = cast(A[tid], "int32") - - return ib.get() - + arr_size = 1024 A = te.placeholder((arr_size,), name="A", dtype="bool") - B = te.placeholder((arr_size,), name="B", dtype="int32") + B = te.compute(A.shape, lambda i: A[i].astype("int32"), name="B") - B = te.extern( - A.shape, - [A], - lambda ins, outs: do_copy(ins[0], outs[0], arr_size), - name="bool_copy_ir", - dtype="int32", - ) - s = te.create_schedule(B.op) + sch = tir.Schedule(te.create_prim_func([A, B])) + xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 128]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") - with tvm.transform.PassContext(opt_level=3): - func = tvm.build(s, [A, B], target) + # Build + f = tvm.build(sch.mod, target=target) a_np = np.random.uniform(size=arr_size) > 0.5 b_np = np.zeros((arr_size,), dtype="int32") a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) - func(a, b) + f(a, b) ref = a_np.astype(np.int32) tvm.testing.assert_allclose(b.numpy(), ref) @@ -270,11 +190,11 @@ def test_vulkan_constant_passing(target, dev, vulkan_parameter_impl, vulkan_para A = te.placeholder((n,), name="A", dtype=dtype) B = te.compute(A.shape, lambda i: scalar_sum + A[i], name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[B].bind(xi, te.thread_axis("threadIdx.x")) - f_add = tvm.build(s, scalars + [A, B], target) + sch = tvm.tir.Schedule(te.create_prim_func(scalars + [A, B])) + xo, xi = sch.split(sch.get_loops("B")[0], factors=[None, 64]) + sch.bind(xo, "blockIdx.x") + sch.bind(xi, "threadIdx.x") + f_add = tvm.build(sch.mod, target=target) n = 1024 scalars = np.array([1 for _ in scalars]).astype(dtype) @@ -287,6 +207,9 @@ def test_vulkan_constant_passing(target, dev, vulkan_parameter_impl, vulkan_para 
def test_vulkan_while_if(target, dev): target = tvm.target.Target(target) + n = 1 + dtype = "int32" + A = te.placeholder((n,), name="A", dtype=dtype) def do_compute(A, B, n): ib = tvm.tir.ir_builder.create() @@ -300,9 +223,6 @@ def do_compute(A, B, n): iterations[0] = 0 B[0] = 0 - # WhileNode's condition is re-evaluated every loop. The - # if_then_else block introduces additional labels/blocks that - # must be kept separate from the WhileNode's block. loop_condition = iterations[0] < tvm.tir.if_then_else(A[0] > 0, 10, 20) with ib.while_loop(loop_condition): iterations[0] += 1 @@ -310,21 +230,19 @@ def do_compute(A, B, n): return ib.get() - n = 1 - dtype = "int32" - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.extern( A.shape, [A], lambda ins, outs: do_compute(ins[0], outs[0], n), dtype=dtype, ) - s = te.create_schedule(B.op) - # Point of failure would be here, at tvm.build. - with tvm.transform.PassContext(opt_level=3): - func = tvm.build(s, [A, B], target) + # Create IRModule + mod = tvm.IRModule.from_expr(te.create_prim_func([A, B])) + sch = tir.Schedule(mod) + + # Build + func = tvm.build(sch.mod, target=target) a = tvm.nd.array(np.array([5], dtype=A.dtype), dev) b = tvm.nd.array(np.zeros(n, dtype=A.dtype), dev) @@ -339,52 +257,40 @@ def do_compute(A, B, n): @tvm.testing.exclude_targets("llvm") def test_vulkan_local_threadidx(target, dev): - # To access the thread index, the vulkan runtime accesses a global - # array of thread indices, storing the result in a local variable. - # In CUDA, these are the built-in threadIdx.x variables, which are - # globally accessible. In vulkan, these local variables must be - # defined inside a function, but are hoisted up to the function - # header to mimic the global CUDA semantics. Before this - # hoisting, this test could trigger spvValidate errors for - # potentially undeclared variables. 
+ target = tvm.target.Target(target) + n = 32 + A = te.placeholder((n,), name="A", dtype="int32") def do_compute(A, B, n): ib = tvm.tir.ir_builder.create() A = ib.buffer_ptr(A) B = ib.buffer_ptr(B) - # One single declaration of te.thread_axis. tx = te.thread_axis("threadIdx.x") with ib.for_range(0, 1): - # Used inside a for-loop scope, defines local thread_id - # variable. ib.scope_attr(tx, "thread_extent", 16) B[tx + 0] = A[tx + 0] with ib.for_range(0, 1): - # Used in next scope. If local variable defined at point - # of use instead of function header, will fail spvValidate - # for access of out-of-scope local variable. ib.scope_attr(tx, "thread_extent", 16) B[tx + 16] = A[tx + 16] return ib.get() - n = te.var("n") - A = te.placeholder((n,), name="A", dtype="int32") - B = te.placeholder((n,), name="B", dtype="int32") - B = te.extern( A.shape, [A], lambda ins, outs: do_compute(ins[0], outs[0], n), dtype="int32", ) - s = te.create_schedule(B.op) - # Expected failure occurs at build step. 
- func = tvm.build(s, [A, B], target) + # Create IRModule + mod = tvm.IRModule.from_expr(te.create_prim_func([A, B])) + sch = tir.Schedule(mod) + + # Build + func = tvm.build(sch.mod, target=target) n = 32 a_np = np.arange(n).astype(dtype=A.dtype) @@ -473,9 +379,8 @@ def do_compute(ins, outs): return ib.get() B = te.extern(A.shape, [A, R], do_compute, dtype="int32") - s = te.create_schedule(B.op) - return tvm.lower(s, [A, R, B]) + return tvm.IRModule.from_expr(te.create_prim_func([A, R, B])) def test_ramp_broadcast_index(self, target, dev, mod, ref_data): f = tvm.build(mod, target=target) @@ -488,36 +393,6 @@ def test_ramp_broadcast_index(self, target, dev, mod, ref_data): tvm.testing.assert_allclose(b.numpy(), b_np) -@tvm.testing.parametrize_targets("vulkan -max_shared_memory_per_block=16384") -def test_shared_mem_alloc(target, dev): - alloc_nbytes = 16384 * 2 - - def do_compute(ins, outs): - ib = tvm.tir.ir_builder.create() - out = ib.buffer_ptr(outs[0]) - - ib.scope_attr(te.thread_axis("blockIdx.x"), "thread_extent", 0) - - array = ib.allocate("int32", (alloc_nbytes,), name="array", scope="shared") - array[0] = 0 - out[0] = array[0] - - return ib.get() - - Out = te.extern( - shape=(1,), - inputs=[], - fcompute=do_compute, - dtype="int32", - ) - s = te.create_schedule(Out.op) - - # Codegen should raise error when allocating more memory than the - # target supports. 
- with pytest.raises(tvm.TVMError): - tvm.build(s, [Out], target) - - def test_negative_operand_divmod(target, dev): """Test handling of negative offsets to floormod/floordiv diff --git a/tests/python/codegen/test_target_codegen_x86.py b/tests/python/codegen/test_target_codegen_x86.py index a276940050b1..f433964f7f5d 100644 --- a/tests/python/codegen/test_target_codegen_x86.py +++ b/tests/python/codegen/test_target_codegen_x86.py @@ -38,9 +38,9 @@ def fp16_to_fp32(target, width, match=None, not_match=None): n = tvm.runtime.convert(elements) A = te.placeholder((n, width), dtype="float16", name="A") B = te.compute(A.shape, lambda *i: A(*i).astype("float32"), name="B") - s = te.create_schedule(B.op) - s[B].vectorize(s[B].op.axis[1]) - f = tvm.build(s, [A, B], target) + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) + sch.vectorize(sch.get_loops("B")[1]) + f = tvm.build(sch.mod, target=target) assembly = f.get_source("asm").splitlines() if match: diff --git a/tests/python/contrib/test_cblas.py b/tests/python/contrib/test_cblas.py index 3c90aefeb67a..b8851e685b13 100644 --- a/tests/python/contrib/test_cblas.py +++ b/tests/python/contrib/test_cblas.py @@ -39,7 +39,6 @@ def verify_matmul_add( final_result = te.compute( matmul_result.shape, lambda i, j: matmul_result[i, j] + bias, name="final_result" ) - s = te.create_schedule(final_result.op) def get_numpy(a, b, matrix_bias, transa, transb): if transa: @@ -64,7 +63,12 @@ def verify(target="llvm"): return dev = tvm.cpu(0) name = "test_matmul_add" - f = tvm.build(s, [input1_data, input2_data, final_result, bias], target, name=name) + f = tvm.build( + te.create_prim_func([input1_data, input2_data, final_result, bias]).with_attr( + "global_symbol", name + ), + target=target, + ) if target == "c": f = compiling(f, name) matrix_input1 = tvm.nd.array(np.random.uniform(size=ashape).astype(input1_data.dtype), dev) @@ -126,7 +130,6 @@ def verify_quantized_matmul_add(matrix_m, matrix_l, matrix_n, transa=False, tran final_result 
= te.compute( matmul_result.shape, lambda i, j: matmul_result[i, j] + bias, name="final_result" ) - s = te.create_schedule(final_result.op) def get_numpy(a, b, matrix_bias, transa, transb): if transa: @@ -143,7 +146,9 @@ def verify(target="llvm"): print("skip because extern function is not available") return dev = tvm.cpu(0) - f = tvm.build(s, [input1_data, input2_data, final_result, bias], target) + f = tvm.build( + te.create_prim_func([input1_data, input2_data, final_result, bias]), target=target + ) matrix_input1 = tvm.nd.array( np.random.randint(low=0, high=50, size=ashape).astype(input1_data.dtype), dev ) @@ -201,7 +206,6 @@ def verify_batch_matmul( final_result = te.compute( matmul_result.shape, lambda k, i, j: matmul_result[k, i, j], name="final_result" ) - s = te.create_schedule(final_result.op) def get_numpy(a, b, transa, transb): if transa: @@ -226,7 +230,7 @@ def verify(target="llvm"): return dev = tvm.cpu(0) name = "test_batch_matmul" - f = tvm.build(s, [input1_data, input2_data, final_result], target, name=name) + f = tvm.build(te.create_prim_func([input1_data, input2_data, final_result]), target=target) if target == "c": f = compiling(f, name) matrix_input1 = tvm.nd.array(np.random.uniform(size=ashape).astype(input1_data.dtype), dev) diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py index 4e65f79c518e..70277cb0ca0a 100644 --- a/tests/python/contrib/test_dlpack.py +++ b/tests/python/contrib/test_dlpack.py @@ -49,10 +49,9 @@ def verify_torch_dlpack(): k = te.reduce_axis((0, n), name="k") ZZ = te.compute((n, n), lambda i, j: te.sum(XX[i, k] * YY[k, j], axis=k)) - s = te.create_schedule(ZZ.op) # No need to speficy target_host if it's llvm # Otherwise you will need to specify the target and target_host - f = tvm.build(s, [XX, YY, ZZ], name="f") + f = tvm.build(te.create_prim_func([XX, YY, ZZ])) f_pytorch = to_pytorch_func(f) zz2 = torch.empty(137, 137) diff --git a/tests/python/contrib/test_gemm_acc16.py 
b/tests/python/contrib/test_gemm_acc16.py deleted file mode 100644 index 18e15098a07e..000000000000 --- a/tests/python/contrib/test_gemm_acc16.py +++ /dev/null @@ -1,105 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=import-self, invalid-name, unused-argument, too-many-lines, len-as-condition -import tvm -from tvm import te -import numpy as np -from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int16 - - -def benchmark_fc_int8_acc16(): - m = 128 - n = 128 - k = 128 - - X = te.placeholder((m, k), name="X", dtype="uint8") - W = te.placeholder((n, k), name="W", dtype="int8") - - peak = 512 / 16 * 2 * 2 * 2 - gops_per_mm = 2 * n * m * k - print("Peak {} Gops/s \n".format(peak)) - - def verify(target="llvm -mcpu=skylake-avx512"): - if not tvm.runtime.enabled(target): - print("skip because %s is not enabled..." 
% target) - return - - dev = tvm.device(target, 0) - X = te.placeholder((m, k), name="X", dtype="uint8") - W = te.placeholder((n, k), name="W", dtype="int8") - pc = dot_16x1x16_uint8_int8_int16() - ak = te.reduce_axis((0, k), name="k") - - packedW = te.placeholder((n // 128, 128 * (k // 2), 2), name="packedW", dtype="int8") - t_fc = te.compute( - (m, n), - lambda i, j: te.sum( - X[i, ak].astype("int16") - * packedW[j // 128, (ak // 2) * 128 + j % 128, ak % 2].astype("int16"), - axis=ak, - ), - name="F", - ) - - t_sch = te.create_schedule(t_fc.op) - a_x, a_y = t_fc.op.axis - (a_k,) = t_fc.op.reduce_axis - - a_yo, a_yi = t_sch[t_fc].split(a_y, factor=128) - a_ko, a_ki = t_sch[t_fc].split(a_k, factor=2) - - a_xo, a_xi = t_sch[t_fc].split(a_x, factor=128) - a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=32) - t_sch[t_fc].reorder(a_yo, a_xo, a_koo, a_xi, a_koi, a_yi, a_ki) - - t_sch[t_fc].tensorize(a_yi, pc) - # print(tvm.lower(t_sch, [X, packedW, t_fc], simple_mode=True)) - t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") - t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10) - - # generate the plain data - a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") - b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8") - - packW = np.random.uniform(1, 10, size=(n // 128, 128 * (k // 2), 2)).astype("int8") - # This occurs in pre_compute stage - for r_idx in range(n // 128): - for s_idx in range(128 * (k // 2)): - for t_idx in range(2): - packW[r_idx][s_idx][t_idx] = b_[r_idx * 128 + s_idx % 128][ - s_idx // 128 * 2 + t_idx - ] - - x = tvm.nd.array(a_, dev) - w = tvm.nd.array(packW, dev) - y = tvm.nd.array(np.zeros((m, n), dtype="int16"), dev) - - result = t_evaluator(x, w, y) - gops_per_sec = gops_per_mm / result.mean / 1e9 - tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=1e-5) - print( - "Tensorization: running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}.".format( - result.mean * 1000, gops_per_sec, 
gops_per_sec / peak - ) - ) - # t_func.export_library("gemm_tensorize.o") - - verify() - - -if __name__ == "__main__": - benchmark_fc_int8_acc16() diff --git a/tests/python/contrib/test_gemm_acc32_vnni.py b/tests/python/contrib/test_gemm_acc32_vnni.py deleted file mode 100644 index 2e15d38612ce..000000000000 --- a/tests/python/contrib/test_gemm_acc32_vnni.py +++ /dev/null @@ -1,115 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import tvm -import tvm.testing -from tvm import te -import numpy as np -from tvm.topi.x86.tensor_intrin import dot_16x1x16_uint8_int8_int32 - - -def verify_fc_int8_acc32(m=1024, n=1024, k=1024, target="llvm -mcpu=cascadelake"): - X = te.placeholder((m, k), name="X", dtype="uint8") - # W = te.placeholder((n, k), name="W", dtype="int8") - - if not tvm.testing.device_enabled(target): - print("skip because %s is not enabled..." 
% target) - return - - dev = tvm.device(target, 0) - # workaround for Target.current() - with tvm.target.Target(target) as target: - pc = dot_16x1x16_uint8_int8_int32() - - ak = te.reduce_axis((0, k), name="k") - packedW = te.placeholder((n // 16, 16 * (k // 4), 4), name="packedW", dtype="int8") - - t_fc = te.compute( - (m, n), - lambda i, j: te.sum( - X[i, ak].astype("int32") - * packedW[ - tvm.tir.indexdiv(j, 16), tvm.tir.indexdiv(ak, 4) * 16 + j % 16, ak % 4 - ].astype("int32"), - axis=ak, - ), - name="F", - ) - t_sch = te.create_schedule(t_fc.op) - a_x, a_y = t_fc.op.axis - (a_k,) = t_fc.op.reduce_axis - - a_yo, a_yi = t_sch[t_fc].split(a_y, factor=16) - a_xo, a_xi = t_sch[t_fc].split(a_x, factor=32) - a_ko, a_ki = t_sch[t_fc].split(a_k, factor=4) - a_koo, a_koi = t_sch[t_fc].split(a_ko, factor=4) - t_sch[t_fc].reorder(a_yo, a_xo, a_xi, a_koo, a_koi, a_yi, a_ki) - - t_sch[t_fc].unroll(a_koi) - t_sch[t_fc].tensorize(a_yi, pc) - - t_func = tvm.build(t_sch, [X, packedW, t_fc], target, name="intrinsic") - t_evaluator = t_func.time_evaluator(t_func.entry_name, dev, number=10) - - # generate the plain data - a_ = np.random.uniform(1, 10, size=(m, k)).astype("uint8") - b_ = np.random.uniform(1, 10, size=(n, k)).astype("int8") - - packW = np.random.uniform(1, 10, size=(n // 16, 16 * (k // 4), 4)).astype("int8") - # This occurs in pre_compute stage - for r_idx in range(n // 16): - for s_idx in range(16 * (k // 4)): - for t_idx in range(4): - packW[r_idx][s_idx][t_idx] = b_[r_idx * 16 + s_idx % 16][(s_idx // 16) * 4 + t_idx] - - x = tvm.nd.array(a_, dev) - w = tvm.nd.array(packW, dev) - y = tvm.nd.array(np.zeros((m, n), dtype="int32"), dev) - result = t_evaluator(x, w, y) - - peak = 280 - print("Peak {} Gops/s".format(peak)) - # memory_ops = m * k + n * k + 2 * m * n - gops_per_mm = 2 * m * n * k - - gops_per_sec = gops_per_mm / result.mean / 1e9 - # verify the correctness - tvm.testing.assert_allclose(y.numpy(), np.dot(a_, b_.T), rtol=0) - print( - "Tensorization: 
running time: {:.3f} ms, {:.2f} Gops/s, effiency: {:.2f}".format( - result.mean * 1000, gops_per_sec, gops_per_sec / peak - ) - ) - # t_func.export_library("tensorize_acc32.o") - - -@tvm.testing.requires_x86_vnni -def test_fc_int8_acc32_vnni(): - # For LLVM < 8.0, it shows "'cascadelake' is not a recognized processor for this target - # (ignoring processor)" error with the following setting. After LLVM 8.0 is enabled in the - # test, we should use cascadelake setting. - verify_fc_int8_acc32() - - -@tvm.testing.requires_x86_avx512 -def test_fc_int8_acc32_avx512(): - verify_fc_int8_acc32(target="llvm -mcpu=skylake-avx512") - - -if __name__ == "__main__": - test_fc_int8_acc32_vnni() - test_fc_int8_acc32_avx512() diff --git a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py deleted file mode 100644 index 07f6c2613dbc..000000000000 --- a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_blocked.py +++ /dev/null @@ -1,207 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -""" Hexagon contrib tests for blocked conv2d """ - - -import numpy as np -import tvm -import tvm.testing -from tvm import te, topi -from tvm.topi import testing - -from ..infrastructure import ( - build_and_run, - conv2d_compute, - conv2d_verify, - get_block_shape, - get_packed_filter_shape, - get_packed_shape, -) - - -def conv2d_nhwc8h8w32c( - shape_input, - pad, - stride, - dilation, - shape_filter, - k_split_factor, - h_split_factor, - dtype, - storage_scope="global", -): - """ - Conv2d wherein the input activation is defined by its - logical NHWC layout. The filter is provided in its physical - packed layout (oihw8i32o4i). The input is padded and then packed - into its physical packed layout (nhwc8h8w32c). The resulting - computation is in the same physical packed layout (nhwc8h8w32c). - """ - - # nhwc layout - logical_input = te.placeholder(shape_input, dtype=dtype, name="logical_input") - - # oihw8i32o4i layout - filt_packed = te.placeholder(shape_filter, dtype=dtype, name="packed_filter") - - block_h, block_w, block_c = get_block_shape() - - # Calculate padded input - _, height, width, _ = shape_input - pad_h = (block_h - ((height + pad[1]) % block_h)) % block_h - pad_w = (block_w - ((width + pad[3]) % block_w)) % block_w - padded_input = topi.nn.pad( - logical_input, - [0, pad[0], pad[2], 0], - [0, pad_h, pad_w, 0], - pad_value=0, - name="padded_input", - ) - - # Calculate packed input - packed_shape = get_packed_shape(padded_input.shape) - packed_input = te.compute( - packed_shape, - lambda n, ho, wo, co, hi, wi, ci: padded_input[ - n, ho * block_h + hi, wo * block_w + wi, co * block_c + ci - ], - name="packed_input", - ) - - output_shape, compute = conv2d_compute(packed_input, filt_packed, pad, stride, dilation) - packed_output = te.compute(output_shape, compute, name="packed_output") - s = te.create_schedule(packed_output.op) - - # Ensure the padding and array packing is performed inline - s[padded_input].compute_inline() - 
s[packed_input].compute_inline() - - # cache reads and writes - cached_input = s.cache_read(packed_input, storage_scope, [packed_output]) - cached_filt = s.cache_read(filt_packed, storage_scope, [packed_output]) - cached_output = s.cache_write(packed_output, storage_scope) - - # cache write schedule - batch, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[packed_output].op.axis - koo, koi = s[packed_output].split(k_outer, factor=k_split_factor) - hoo, hoi = s[packed_output].split(h_outer, factor=h_split_factor) - s[packed_output].reorder(batch, koo, hoo, koi, hoi, w_outer, h_inner, w_inner, k_inner) - s[cached_output].compute_at(s[packed_output], hoo) - - # compute schedule - batch, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[cached_output].op.axis - _, _, reduce_c = s[cached_output].op.reduce_axis - rco, rci = s[cached_output].split(reduce_c, factor=block_c) - koo, koi = s[cached_output].split(k_outer, factor=k_split_factor) - hoo, hoi = s[cached_output].split(h_outer, factor=h_split_factor) - s[cached_output].reorder( - batch, koo, hoo, koi, hoi, w_outer, rco, h_inner, w_inner, k_inner, rci - ) - s[cached_input].compute_at(s[cached_output], hoo) - s[cached_filt].compute_at(s[cached_output], hoo) - - binds = {} - if storage_scope and storage_scope != "global": - with tvm.transform.PassContext(): - input_buffer = tvm.tir.decl_buffer( - packed_shape, name="Xb", dtype=dtype, scope=storage_scope - ) - output_buffer = tvm.tir.decl_buffer( - output_shape, name="Yb", dtype=dtype, scope=storage_scope - ) - binds = {logical_input: input_buffer, packed_output: output_buffer} - - return (s, [logical_input, filt_packed, packed_output], binds) - - -class BaseConv2d: - """Base class for conv2d tests""" - - # input - batch = tvm.testing.parameter(1) - in_size = tvm.testing.parameter(64) - in_channel = tvm.testing.parameter(64) - # conv2d - pad = tvm.testing.parameter(0) - stride = tvm.testing.parameter(1) - kernel_size = tvm.testing.parameter(1, 3) - 
out_channel = tvm.testing.parameter(128) - # schedule params - k_split_factor = tvm.testing.parameter(1, 2) - h_split_factor = tvm.testing.parameter(1, 2) - dtype = tvm.testing.parameter("float32") - - -class TestConv2dPackedFilter(BaseConv2d): - """Conv2d packed filter test class""" - - @tvm.testing.parametrize_targets("llvm") - @tvm.testing.skip_if_32bit(reason="Test known to be flaky on i386 machines") - def test_conv2d( - self, - batch, - in_size, - in_channel, - pad, - stride, - kernel_size, - out_channel, - k_split_factor, - h_split_factor, - dtype, - target, - ): - """conv2d test""" - # TODO: no support for dilation - dilation = 1 - - shape_input = [batch, in_size, in_size, in_channel] - shape_filter_oihw = [out_channel, in_channel, kernel_size, kernel_size] - shape_filter_oihw8i32o4i = get_packed_filter_shape(shape_filter_oihw) - - inputs = [ - np.random.uniform(0, 255, size=shape_input).astype(dtype), - np.random.uniform(0, 255, size=shape_filter_oihw8i32o4i).astype(dtype), - ] - np_filter = ( - inputs[1] - .transpose(0, 5, 1, 4, 6, 2, 3) - .reshape(shape_filter_oihw) - .transpose(2, 3, 1, 0) - ) - ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) - output = build_and_run( - inputs, - conv2d_nhwc8h8w32c, - target, - target, - shape_input=shape_input, - pad=(pad, pad, pad, pad), - stride=(stride, stride), - dilation=(dilation, dilation), - shape_filter=shape_filter_oihw8i32o4i, - k_split_factor=k_split_factor, - h_split_factor=h_split_factor, - dtype=dtype, - ) - - conv2d_verify(output, ref_output, dtype) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py b/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py deleted file mode 100644 index fa770c9be313..000000000000 --- a/tests/python/contrib/test_hexagon/conv2d/test_conv2d_conv2d.py +++ /dev/null @@ -1,252 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more 
contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" back-to-back conv2d Hexagon test for stripe scheduling """ - - -import numpy as np -import tvm -import tvm.testing -from tvm import te, topi -from tvm.topi import testing - -from ..infrastructure import ( - build_and_run, - conv2d_compute, - conv2d_verify, - get_block_shape, - get_packed_filter_shape, - get_packed_shape, -) - - -def conv2dconv2d_nhwc8h8w32c( - shape_input, - pad1, - stride1, - dilation1, - shape_filter1, - pad2, - stride2, - dilation2, - shape_filter2, - k_split_factor, - h_split_factor, - dtype, - storage_scope="global", -): - """ - Conv2d -> Conv2d wherein the input activation is defined by its - logical NHWC layout. The filter is provided in its physical - packed layout (oihw8i32o4i). The input is padded and then packed - into its physical packed layout (nhwc8h8w32c). The resulting - computation is in the same physical packed layout (nhwc8h8w32c). 
- """ - - # nhwc layout - logical_input = te.placeholder(shape_input, dtype=dtype, name="logical_input") - - # oihw8i32o4i layout - filt_packed1 = te.placeholder(shape_filter1, dtype=dtype, name="packed_filter1") - filt_packed2 = te.placeholder(shape_filter2, dtype=dtype, name="packed_filter2") - - block_h, block_w, block_c = get_block_shape() - - # Calculate padded input - _, height, width, _ = shape_input - pad_h = (block_h - ((height + pad1[1]) % block_h)) % block_h - pad_w = (block_w - ((width + pad1[3]) % block_w)) % block_w - padded_input = topi.nn.pad( - logical_input, - [0, pad1[0], pad1[2], 0], - [0, pad_h, pad_w, 0], - pad_value=0, - name="padded_input", - ) - - # Calculate packed input - packed_shape = get_packed_shape(padded_input.shape) - packed_input = te.compute( - packed_shape, - lambda n, ho, wo, co, hi, wi, ci: padded_input[ - n, ho * block_h + hi, wo * block_w + wi, co * block_c + ci - ], - name="packed_input", - ) - - output_shape1, compute1 = conv2d_compute(packed_input, filt_packed1, pad1, stride1, dilation1) - temp_output = te.compute(output_shape1, compute1, name="temp_output") - - output_shape2, compute2 = conv2d_compute(temp_output, filt_packed2, pad2, stride2, dilation2) - packed_output = te.compute(output_shape2, compute2, name="packed_output") - s = te.create_schedule(packed_output.op) - - # Ensure the padding and array packing is performed inline - s[padded_input].compute_inline() - s[packed_input].compute_inline() - - # cache reads and writes - packed_input_cached = s.cache_read(packed_input, storage_scope, [temp_output]) - filt_packed1_cached = s.cache_read(filt_packed1, storage_scope, [temp_output]) - filt_packed2_cached = s.cache_read(filt_packed2, storage_scope, [packed_output]) - packed_output_cached = s.cache_write(packed_output, storage_scope) - - # conv2d #1 schedule - n, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[temp_output].op.axis - _, _, reduce_channel = s[temp_output].op.reduce_axis - rco, rci = 
s[temp_output].split(reduce_channel, factor=block_c) - koo, koi = s[temp_output].split(k_outer, factor=k_split_factor) - hoo, hoi = s[temp_output].split(h_outer, factor=h_split_factor) - s[temp_output].reorder(n, koo, hoo, koi, hoi, w_outer, rco, h_inner, w_inner, k_inner, rci) - s[packed_input_cached].compute_at(s[temp_output], hoo) - s[filt_packed1_cached].compute_at(s[temp_output], hoo) - - # cache write schedule - n, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[packed_output].op.axis - koo, koi = s[packed_output].split(k_outer, factor=k_split_factor) - hoo, hoi = s[packed_output].split(h_outer, factor=h_split_factor) - s[packed_output].reorder(n, koo, hoo, koi, hoi, w_outer, h_inner, w_inner, k_inner) - s[packed_output_cached].compute_at(s[packed_output], hoo) - - # conv2d #2 schedule - n, h_outer, w_outer, k_outer, h_inner, w_inner, k_inner = s[packed_output_cached].op.axis - _, _, reduce_channel = s[packed_output_cached].op.reduce_axis - rco, rci = s[packed_output_cached].split(reduce_channel, factor=block_c) - koo, koi = s[packed_output_cached].split(k_outer, factor=k_split_factor) - hoo, hoi = s[packed_output_cached].split(h_outer, factor=h_split_factor) - s[packed_output_cached].reorder( - n, koo, hoo, koi, hoi, w_outer, rco, h_inner, w_inner, k_inner, rci - ) - s[temp_output].compute_at(s[packed_output_cached], hoo) - s[filt_packed2_cached].compute_at(s[packed_output_cached], hoo) - - binds = {} - if storage_scope and storage_scope != "global": - with tvm.transform.PassContext(): - input_buffer = tvm.tir.decl_buffer( - packed_shape, name="Xb", dtype=dtype, scope=storage_scope - ) - output_buffer = tvm.tir.decl_buffer( - output_shape2, name="Yb", dtype=dtype, scope=storage_scope - ) - binds = {logical_input: input_buffer, packed_output: output_buffer} - - return (s, [logical_input, filt_packed1, filt_packed2, packed_output], binds) - - -class BaseConv2dConv2d: - """Base class for conv2d-conv2d tests""" - - # input - batch = 
tvm.testing.parameter(1) - in_size = tvm.testing.parameter(64) - in_channel = tvm.testing.parameter(128) - # conv2d #1 - pad1 = tvm.testing.parameter(0) - stride1 = tvm.testing.parameter(1) - kernel_size1 = tvm.testing.parameter(1, 3) - out_channel1 = tvm.testing.parameter(128) - # conv2d #2 - stride2 = tvm.testing.parameter(1) - kernel_size2 = tvm.testing.parameter(1, 3) - out_channel2 = tvm.testing.parameter(128) - # schedule params - k_split_factor = tvm.testing.parameter(1, 2) - h_split_factor = tvm.testing.parameter(1, 2) - dtype = tvm.testing.parameter("float32") - - -class TestConv2dConv2dPackedFilter(BaseConv2dConv2d): - """Conv2d-Conv2d packed filter test class""" - - @tvm.testing.parametrize_targets("llvm") - @tvm.testing.skip_if_32bit(reason="Test known to be flaky on i386 machines") - def test_conv2d( - self, - batch, - in_size, - in_channel, - pad1, - stride1, - kernel_size1, - out_channel1, - stride2, - kernel_size2, - out_channel2, - k_split_factor, - h_split_factor, - dtype, - target, - ): - """conv2d-conv2d test""" - # TODO: no support for padding in conv2d #2 - pad2 = 0 - - # TODO: no support for dilation - dilation1 = 1 - dilation2 = 1 - - shape_input = [batch, in_size, in_size, in_channel] - shape_filter1_oihw = [out_channel1, in_channel, kernel_size1, kernel_size1] - shape_filter1_oihw8i32o4i = get_packed_filter_shape(shape_filter1_oihw) - - shape_filter2_oihw = [out_channel2, out_channel1, kernel_size2, kernel_size2] - shape_filter2_oihw8i32o4i = get_packed_filter_shape(shape_filter2_oihw) - - inputs = [ - np.random.uniform(0, 255, size=shape_input).astype(dtype), - np.random.uniform(0, 255, size=shape_filter1_oihw8i32o4i).astype(dtype), - np.random.uniform(0, 255, size=shape_filter2_oihw8i32o4i).astype(dtype), - ] - np_filter1 = ( - inputs[1] - .transpose(0, 5, 1, 4, 6, 2, 3) - .reshape(shape_filter1_oihw) - .transpose(2, 3, 1, 0) - ) - np_filter2 = ( - inputs[2] - .transpose(0, 5, 1, 4, 6, 2, 3) - .reshape(shape_filter2_oihw) - .transpose(2, 
3, 1, 0) - ) - temp_output = testing.conv2d_nhwc_python(inputs[0], np_filter1, stride1, pad1) - ref_output = testing.conv2d_nhwc_python(temp_output, np_filter2, stride2, pad2) - output = build_and_run( - inputs, - conv2dconv2d_nhwc8h8w32c, - target, - target, - shape_input=shape_input, - pad1=(pad1, pad1, pad1, pad1), - stride1=(stride1, stride1), - dilation1=(dilation1, dilation1), - shape_filter1=shape_filter1_oihw8i32o4i, - pad2=(pad2, pad2, pad1, pad1), - stride2=(stride2, stride2), - dilation2=(dilation2, dilation2), - shape_filter2=shape_filter2_oihw8i32o4i, - k_split_factor=k_split_factor, - h_split_factor=h_split_factor, - dtype=dtype, - ) - - conv2d_verify(output, ref_output, dtype) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py index d22b2db9c399..99fc6ac074c2 100644 --- a/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py +++ b/tests/python/contrib/test_hexagon/test_2d_physical_buffers.py @@ -200,12 +200,7 @@ def schedule_args( working_scope, ): """Create and return the schedule and input args after applying layout transform""" - if schedule_type == "TE": - - return self._te_schedule_args( - input_shape, dtype, input_layout, output_layout, working_layout, working_scope - ) - elif schedule_type == "TIR": + if schedule_type == "TIR": return self._tir_schedule_args( input_shape, dtype, input_layout, output_layout, working_layout, working_scope ) @@ -222,40 +217,6 @@ def _te_tensors(self, input_shape, dtype): ) return input_tensor, output_tensor - def _te_schedule_args( - self, - input_shape, - dtype, - input_layout, - output_layout, - working_layout, - working_scope, - ): - input_tensor, output_tensor = self._te_tensors(input_shape, dtype) - - schedule = te.create_schedule(output_tensor.op) - - write_cache = schedule.cache_write(output_tensor, working_scope) - read_cache = 
schedule.cache_read(input_tensor, working_scope, [write_cache]) - - def apply_transform(tensor, layout): - if layout == "nhwc": - return None - if layout == "nchw-8h8w32c-1d": - return schedule[tensor].transform_layout(layout_transform_1d) - if layout == "nchw-8h8w32c-2d": - return schedule[tensor].transform_layout(layout_transform_2d) - raise RuntimeError(f"Unexpected layout '{layout}'") - - apply_transform(input_tensor, input_layout) - compute_loopnest = apply_transform(output_tensor, output_layout) or output_tensor.op.axis - schedule[write_cache].compute_at(schedule[output_tensor], compute_loopnest[0]) - - apply_transform(read_cache, working_layout) - apply_transform(write_cache, working_layout) - - return [schedule, [input_tensor, output_tensor]] - def _tir_schedule_args( self, input_shape, dtype, input_layout, output_layout, working_layout, working_scope ): diff --git a/tests/python/contrib/test_hexagon/test_launcher.py b/tests/python/contrib/test_hexagon/test_launcher.py index 95c6c1e19805..c84e7a9d4a4c 100644 --- a/tests/python/contrib/test_hexagon/test_launcher.py +++ b/tests/python/contrib/test_hexagon/test_launcher.py @@ -39,11 +39,9 @@ def test_add(hexagon_session: Session): compute_c = tvm.te.compute( placeholder_a.shape, lambda i: placeholder_a[i] + placeholder_b[0], name="C" ) - sched = tvm.te.create_schedule(compute_c.op) func = tvm.build( - sched, - [placeholder_a, placeholder_b, compute_c], + te.create_prim_func([placeholder_a, placeholder_b, compute_c]), get_hexagon_target("v68"), name="add", ) @@ -69,11 +67,9 @@ def test_add_vtcm(hexagon_session: Session): compute_c = tvm.te.compute( placeholder_a.shape, lambda i: placeholder_a[i] + placeholder_b[0], name="C" ) - sched = tvm.te.create_schedule(compute_c.op) func = tvm.build( - sched, - [placeholder_a, placeholder_b, compute_c], + te.create_prim_func([placeholder_a, placeholder_b, compute_c]), get_hexagon_target("v68"), name="add", ) @@ -117,11 +113,9 @@ def test_matmul(self, hexagon_session, 
size_m, size_n, size_k): placeholder_x[i, reduce_k1] * placeholder_y[reduce_k1, j], axis=[reduce_k1] ), ) - schedule = te.create_schedule(compute_z.op) func = tvm.build( - schedule, - [placeholder_x, placeholder_y, compute_z], + te.create_prim_func([placeholder_x, placeholder_y, compute_z]), get_hexagon_target("v68"), ) diff --git a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py b/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py deleted file mode 100644 index 0cc6dbd8163f..000000000000 --- a/tests/python/contrib/test_hexagon/test_maxpool2d_blocked.py +++ /dev/null @@ -1,158 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Contrib tests for blocked conv2d and maxpool2d""" - -import numpy as np - -import tvm -import tvm.testing -from tvm import te, topi -from tvm.topi import testing - -from .infrastructure import build_and_run, get_block_shape, get_packed_shape - - -# Blocked layout: NHWC8h8w32c :: [N, H//8, W//8, C//32, 8h, 8w, 32c] -def maxpool2d_logical( - shape_nhwc, - window_shape, - stride, - padding, - dtype, - storage_scope="global", -): - """ - Maxpool2d TE wherein the input activation is defined by its - logical NHWC shape. The packed physical layout for the - activation is nhwc8h8w32c. 
- """ - - block_h, block_w, block_c = get_block_shape() - shape = get_packed_shape(shape_nhwc) - logical_output_shape = ( - shape_nhwc[0], - (shape_nhwc[1] - window_shape[0] + padding[0] + padding[1]) // stride[0] + 1, - (shape_nhwc[2] - window_shape[1] + padding[2] + padding[3]) // stride[0] + 1, - shape_nhwc[3], - ) - output_shape = get_packed_shape(logical_output_shape) - - _, height, width, _ = shape_nhwc - placeholder_x = te.placeholder(shape_nhwc, dtype=dtype) - - # Combination of padding required by maxpool operator and padding to evenly divisible - # number of blocks. Note that this padding should be inlined in the schedule so - # as to avoid input copying. - pad_h = (block_h - ((height + padding[1]) % block_h)) % block_h - pad_w = (block_w - ((width + padding[3]) % block_w)) % block_w - x_pad = topi.nn.pad( - placeholder_x, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0 - ) - - # Calculate packed layout - x_packed = te.compute( - shape, - lambda n, ho, wo, co, hi, wi, ci: x_pad[ - n, ho * block_h + hi, wo * block_w + wi, co * block_c + ci - ], - ) - - reduce_h = te.reduce_axis((0, window_shape[0]), name="rh") - reduce_w = te.reduce_axis((0, window_shape[1]), name="rw") - - def compute(batch, h_outer, w_outer, c_outer, h_inner, w_inner, c_inner): - # Construct blockized strided maxpool height indices - h = h_outer * block_h + h_inner - h_contig = h * stride[0] + reduce_h - h_block_id = h_contig // block_h - h_block_offset = h_contig % block_h - - # Construct blockized strided maxpool width indices - w_idx = w_outer * block_w + w_inner - w_contig = w_idx * stride[1] + reduce_w - w_block_id = w_contig // block_w - w_block_offset = w_contig % block_w - - return te.max( - x_packed[ - batch, h_block_id, w_block_id, c_outer, h_block_offset, w_block_offset, c_inner - ], - axis=[reduce_h, reduce_w], - ) - - compute_y = te.compute(output_shape, compute) - schedule = te.create_schedule(compute_y.op) - - # Ensure the padding and array packing is 
performed inline - schedule[x_pad].compute_inline() - schedule[x_packed].compute_inline() - - binds = {} - if storage_scope and storage_scope != "global": - with tvm.transform.PassContext(): - x_buffer = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope) - y_buffer = tvm.tir.decl_buffer( - output_shape, name="Yb", dtype=dtype, scope=storage_scope - ) - binds = {placeholder_x: x_buffer, compute_y: y_buffer} - - return (schedule, [placeholder_x, compute_y], binds) - - -class BaseMaxPooling: - batch = tvm.testing.parameter(1) - in_size = tvm.testing.parameter(8, 112) - in_channel = tvm.testing.parameter(64) - window_size = tvm.testing.parameter(3) - stride = tvm.testing.parameter(2) - pad = tvm.testing.parameter(1) - dtype = tvm.testing.parameter("float32") - - -class TestMaxPooling(BaseMaxPooling): - """Test MaxPool class""" - - @tvm.testing.parametrize_targets("llvm") - def test_maxpool(self, shape_nhwc, window_size, stride, pad, dtype, target): - """Test blocked maxpool""" - inputs = [np.random.uniform(0, 255, size=shape_nhwc).astype(dtype)] - ref_output = testing.poolnd_python( - inputs[0], - (window_size, window_size), - strides=(stride, stride), - dilation=(1, 1), - padding_before=(pad, pad), - padding_after=(pad, pad), - pool_type="max", - ) - output = build_and_run( - inputs, - maxpool2d_logical, - target, - target, - shape_nhwc, - window_shape=(window_size, window_size), - stride=(stride, stride), - padding=(pad, pad, pad, pad), - dtype=dtype, - ) - assert all([output is not None, ref_output is not None]) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/contrib/test_hipblas.py b/tests/python/contrib/test_hipblas.py index 63a7553704bf..e5df51e62942 100644 --- a/tests/python/contrib/test_hipblas.py +++ b/tests/python/contrib/test_hipblas.py @@ -29,14 +29,13 @@ def verify_matmul_add(in_dtype, out_dtype, rtol=1e-5): A = te.placeholder((n, l), name="A", dtype=in_dtype) B = te.placeholder((l, m), name="B", 
dtype=in_dtype) C = hipblas.matmul(A, B, dtype=out_dtype) - s = te.create_schedule(C.op) def verify(target="rocm"): if not tvm.get_global_func("tvm.contrib.hipblas.matmul", True): print("skip because extern function is not available") return dev = tvm.rocm(0) - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) a = tvm.nd.array(np.random.uniform(0, 128, size=(n, l)).astype(A.dtype), dev) b = tvm.nd.array(np.random.uniform(0, 128, size=(l, m)).astype(B.dtype), dev) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) @@ -56,10 +55,9 @@ def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5): A = te.placeholder(Ashape, name="A", dtype=in_dtype) B = te.placeholder(Bshape, name="B", dtype=in_dtype) C = hipblas.batch_matmul(A, B, dtype=out_dtype) - s = te.create_schedule(C.op) dev = tvm.rocm(0) - f = tvm.build(s, [A, B, C], "rocm") + f = tvm.build(te.create_prim_func([A, B, C]), target="rocm") if "int" in in_dtype: a = tvm.nd.array(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev) diff --git a/tests/python/contrib/test_miopen.py b/tests/python/contrib/test_miopen.py deleted file mode 100644 index 81115b6c0238..000000000000 --- a/tests/python/contrib/test_miopen.py +++ /dev/null @@ -1,136 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -import tvm -import tvm.testing -from tvm import te -from tvm.contrib import miopen -import numpy as np -import pytest - - -requires_miopen = pytest.mark.skipif( - tvm.get_global_func("tvm.contrib.miopen.conv2d.setup", True) is None, - reason="MIOpen is not enabled", -) - - -@tvm.testing.requires_rocm -@requires_miopen -def test_conv2d(): - in_channel = 3 - out_channel = 64 - filter_h = 3 - filter_w = 3 - pad_h = 1 - pad_w = 1 - stride_h = 1 - stride_w = 1 - dilation_h = 1 - dilation_w = 1 - - xshape = [1, in_channel, 128, 128] - wshape = (out_channel, in_channel, filter_h, filter_w) - - X = te.placeholder(xshape, name="X") - W = te.placeholder(wshape, name="W") - Y = miopen.conv2d_forward( - X, W, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, conv_mode=0, data_type=1 - ) - - yshape = [x.value for x in Y.shape] - from tvm import topi - - s = te.create_schedule(Y.op) - - def verify(): - dev = tvm.rocm(0) - f = tvm.build(s, [X, W, Y], "rocm --host=llvm", name="conv2d") - x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), dev) - w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), dev) - y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev) - f(x, w, y) - - Y_ref = topi.nn.conv2d_nchw( - X, W, (stride_h, stride_w), (pad_h, pad_w), (dilation_h, dilation_w) - ) - s_ref = te.create_schedule(Y_ref.op) - f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm --host=llvm") - y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev) - f_ref(x, w, y_ref) - print("Max abs diff:", np.max(np.abs(y.numpy() - y_ref.numpy()))) - tvm.testing.assert_allclose(y.numpy(), y_ref.numpy(), atol=1e-3) - - verify() - - -def verify_softmax(shape, axis, dtype="float32", log_softmax=False): - miopen_op = miopen.log_softmax if log_softmax else miopen.softmax - testing_op = ( - tvm.topi.testing.log_softmax_python 
if log_softmax else tvm.topi.testing.softmax_python - ) - - A = te.placeholder(shape, dtype=dtype, name="A") - B = miopen_op(A, axis) - s = te.create_schedule([B.op]) - - dev = tvm.rocm(0) - a_np = np.random.uniform(size=shape).astype(dtype) - b_np = testing_op(a_np) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - f = tvm.build(s, [A, B], target="rocm --host=llvm", name="softmax") - f(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3) - - -def verify_softmax_4d(shape, dtype="float32", log_softmax=False): - miopen_op = miopen.log_softmax if log_softmax else miopen.softmax - testing_op = ( - tvm.topi.testing.log_softmax_python if log_softmax else tvm.topi.testing.softmax_python - ) - - A = te.placeholder(shape, dtype=dtype, name="A") - B = miopen_op(A, axis=1) - s = te.create_schedule([B.op]) - - dev = tvm.rocm(0) - n, c, h, w = shape - a_np = np.random.uniform(size=shape).astype(dtype) - b_np = testing_op(a_np.transpose(0, 2, 3, 1).reshape(h * w, c)) - b_np = b_np.reshape(n, h, w, c).transpose(0, 3, 1, 2) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - f = tvm.build(s, [A, B], target="rocm --host=llvm", name="softmax") - f(a, b) - tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3) - - -@tvm.testing.requires_rocm -@requires_miopen -def test_softmax(): - verify_softmax((32, 10), -1) - verify_softmax((3, 4), -1) - verify_softmax_4d((1, 16, 256, 256)) - verify_softmax_4d((1, 16, 256, 256)) - - verify_softmax((32, 10), -1, log_softmax=True) - verify_softmax((3, 4), -1, log_softmax=True) - verify_softmax_4d((1, 16, 256, 256), log_softmax=True) - - -if __name__ == "__main__": - test_conv2d() diff --git a/tests/python/contrib/test_mps.py b/tests/python/contrib/test_mps.py index 92462e4c4f9e..e876672feaed 100644 --- a/tests/python/contrib/test_mps.py +++ b/tests/python/contrib/test_mps.py @@ -29,33 +29,20 @@ def test_matmul(): A = te.placeholder((n, l), name="A") B = te.placeholder((l, m), name="B") C = mps.matmul(A, B) - D = 
te.compute(C.shape, lambda *i: C(*i) + 1.0) - s = te.create_schedule(D.op) - yo, xo = D.op.axis - block_y = te.thread_axis("blockIdx.y") - block_x = te.thread_axis("blockIdx.x") - thread_y = te.thread_axis("threadIdx.y") - thread_x = te.thread_axis("threadIdx.x") - by, ty = s[D].split(yo, factor=16) - bx, tx = s[D].split(xo, factor=16) - s[D].bind(by, block_y) - s[D].bind(bx, block_x) - s[D].bind(ty, thread_y) - s[D].bind(tx, thread_x) - def verify(A, B, D, s, target="metal"): + def verify(A, B, C): if not tvm.get_global_func("tvm.contrib.mps.matmul", True): print("skip because extern function is not available") return dev = tvm.metal(0) - f = tvm.build(s, [A, B, D], "metal") + f = tvm.build(te.create_prim_func([A, B, C]), target="metal") a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) f(a, b, c) - tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()) + 1, rtol=1e-5) + tvm.testing.assert_allclose(c.numpy(), np.dot(a.numpy(), b.numpy()), rtol=1e-5) - verify(A, B, D, s) + verify(A, B, C) @tvm.testing.requires_metal @@ -71,20 +58,17 @@ def test_conv2d(): A = te.placeholder((n, h, w, ci), name="x") B = te.placeholder((co, kh, kw, ci), name="w") C = mps.conv2d(A, B, "SAME", 2) - s1 = te.create_schedule(C.op) def verify(A, B, C, target="llvm"): if not tvm.get_global_func("tvm.contrib.mps.conv2d", True): print("skip because extern function is not available") return dev = tvm.metal(0) - f = tvm.build(s1, [A, B, C], "metal") + f = tvm.build(te.create_prim_func([A, B, C]), target="metal") a = tvm.nd.array(np.random.uniform(size=(n, h, w, ci)).astype(A.dtype), dev) b = tvm.nd.array(np.random.uniform(size=(co, kh, kw, ci)).astype(B.dtype), dev) c = tvm.nd.array(np.zeros((n, h // stride, w // stride, co), dtype=C.dtype), dev) f(a, b, c) - # print(c.numpy()) - # print(c.shape) verify(A, B, C, s1) diff --git 
a/tests/python/contrib/test_random.py b/tests/python/contrib/test_random.py index 6ffd417a0a48..be9fed2c6ee8 100644 --- a/tests/python/contrib/test_random.py +++ b/tests/python/contrib/test_random.py @@ -30,7 +30,6 @@ def test_randint(): m = 10240 n = 10240 A = random.randint(-127, 128, size=(m, n), dtype="int32") - s = te.create_schedule(A.op) def verify(target="llvm"): if not tvm.testing.device_enabled(target): @@ -40,7 +39,7 @@ def verify(target="llvm"): print("skip because extern function is not available") return dev = tvm.cpu(0) - f = tvm.build(s, [A], target) + f = tvm.build(te.create_prim_func([A]), target=target) a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.numpy() @@ -56,7 +55,6 @@ def test_uniform(): m = 10240 n = 10240 A = random.uniform(0, 1, size=(m, n)) - s = te.create_schedule(A.op) def verify(target="llvm"): if not tvm.testing.device_enabled(target): @@ -66,7 +64,7 @@ def verify(target="llvm"): print("skip because extern function is not available") return dev = tvm.cpu(0) - f = tvm.build(s, [A], target) + f = tvm.build(te.create_prim_func([A]), target=target) a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.numpy() @@ -82,7 +80,6 @@ def test_normal(): m = 10240 n = 10240 A = random.normal(3, 4, size=(m, n)) - s = te.create_schedule(A.op) def verify(target="llvm"): if not tvm.testing.device_enabled(target): @@ -92,7 +89,7 @@ def verify(target="llvm"): print("skip because extern function is not available") return dev = tvm.cpu(0) - f = tvm.build(s, [A], target) + f = tvm.build(te.create_prim_func([A]), target=target) a = tvm.nd.array(np.zeros((m, n), dtype=A.dtype), dev) f(a) na = a.numpy() diff --git a/tests/python/contrib/test_rocblas.py b/tests/python/contrib/test_rocblas.py index c5321cd4eaaf..2c1889a0c43b 100644 --- a/tests/python/contrib/test_rocblas.py +++ b/tests/python/contrib/test_rocblas.py @@ -33,14 +33,13 @@ def test_matmul(): A = te.placeholder((n, l), name="A") B = te.placeholder((l, m), 
name="B") C = rocblas.matmul(A, B) - s = te.create_schedule(C.op) def verify(target="rocm"): if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True): print("skip because extern function is not available") return dev = tvm.rocm(0) - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) @@ -57,7 +56,6 @@ def verify_batch_matmul(batch, m, k, n, lib, transa=False, transb=False, dtype=" A = te.placeholder(ashape, name="A", dtype=dtype) B = te.placeholder(bshape, name="B", dtype=dtype) C = lib.batch_matmul(A, B, transa, transb) - s = te.create_schedule(C.op) def get_numpy(a, b, transa, transb): if transa: @@ -74,7 +72,7 @@ def verify(target="rocm"): print("skip because extern function is not available") return dev = tvm.rocm(0) - f = tvm.build(s, [A, B, C], target) + f = tvm.build(te.create_prim_func([A, B, C]), target=target) a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev) b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev) c = tvm.nd.array(np.zeros((batch, m, n), dtype=C.dtype), dev) diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py index c135450c09e1..0e0aa71caf10 100644 --- a/tests/python/contrib/test_sort.py +++ b/tests/python/contrib/test_sort.py @@ -20,7 +20,6 @@ import tvm import tvm.testing from tvm import te -from tvm.topi.cuda import sort_by_key def test_sort(): @@ -53,8 +52,7 @@ def test_sort(): dev = tvm.cpu(0) target = "llvm" - s = te.create_schedule(out.op) - f = tvm.build(s, [data, sort_num, out], target) + f = tvm.build(te.create_prim_func([data, sort_num, out]), target=target) a = tvm.nd.array(np.array(input_data).astype(data.dtype), dev) b = tvm.nd.array(np.array(sort_num_input).astype(sort_num.dtype), dev) c = 
tvm.nd.array(np.zeros(a.shape, dtype=out.dtype), dev) @@ -82,8 +80,7 @@ def test_sort_np(): dev = tvm.cpu(0) target = "llvm" - s = te.create_schedule(out.op) - f = tvm.build(s, [data, sort_num, out], target) + f = tvm.build(te.create_prim_func([data, sort_num, out]), target=target) np_data = np.random.uniform(size=dshape) np_out = np.argsort(np_data, axis=axis) @@ -95,40 +92,6 @@ def test_sort_np(): tvm.testing.assert_allclose(c.numpy(), np_out, rtol=1e-5) -def test_sort_by_key_gpu(): - """Tests sort function using gpu""" - size = 6 - keys = te.placeholder((size,), name="keys", dtype="int32") - values = te.placeholder((size,), name="values", dtype="int32") - - for target in ["cuda", "nvptx", "opencl", "rocm"]: - if not tvm.testing.device_enabled(target): - print("Skip because %s is not enabled" % target) - continue - - with tvm.target.Target(target): - keys_out, values_out = sort_by_key(keys, values) - dev = tvm.device(target) - s = te.create_schedule([keys_out.op, values_out.op]) - f = tvm.build(s, [keys, values, keys_out, values_out], target) - - keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32) - values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32) - keys_np_out = np.zeros(keys_np.shape, np.int32) - values_np_out = np.zeros(values_np.shape, np.int32) - keys_in = tvm.nd.array(keys_np, dev) - values_in = tvm.nd.array(values_np, dev) - keys_out = tvm.nd.array(keys_np_out, dev) - values_out = tvm.nd.array(values_np_out, dev) - f(keys_in, values_in, keys_out, values_out) - - ref_keys_out = np.sort(keys_np) - ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)]) - tvm.testing.assert_allclose(keys_out.numpy(), ref_keys_out, rtol=1e-5) - tvm.testing.assert_allclose(values_out.numpy(), ref_values_out, rtol=1e-5) - - if __name__ == "__main__": test_sort() test_sort_np() - test_sort_by_key_gpu() diff --git a/tests/python/contrib/test_sparse.py b/tests/python/contrib/test_sparse.py deleted file mode 100644 index 8ebd02cc170c..000000000000 --- 
a/tests/python/contrib/test_sparse.py +++ /dev/null @@ -1,123 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Configure pytest""" -# pylint: disable=invalid-name -from collections import namedtuple -import numpy as np -import tvm -import tvm.testing -from tvm import te -import tvm.contrib.sparse as tvmsp -import tvm.runtime.ndarray as _nd - - -def test_static_tensor(): - """Tests static tensor""" - dtype = "float32" - target = "llvm" - dev = tvm.device(target, 0) - m = te.size_var("m") - n = te.size_var("n") - A = tvmsp.placeholder(shape=(m, n), name="A", dtype=dtype) - assert A.stype == "csr" - n = 3 - a = np.maximum(np.random.uniform(size=(n, n)).astype(dtype) - 0.6, 0.0) - a = tvmsp.array(a, dev) - A.data = te.placeholder(a.data.shape, dtype, name="A_data") - Ab = tvm.tir.decl_buffer(a.data.shape, dtype, name="A_data") - binds = {A.data: Ab} - C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter") - s = te.create_schedule(C.op) - f = tvm.build(s, [A.data, C], target, binds=binds) - c = tvmsp.array(np.zeros((n, n), dtype), dev) - c.data = tvm.nd.empty(a.data.shape, dtype) - c.indices = a.indices - c.indptr = a.indptr - f(a.data, c.data) - tvm.testing.assert_allclose(c.numpy(), a.numpy() * 2.0, 
rtol=1e-5) - - -def test_dynamic_tensor(): - """Tests dynamic tensor""" - dtype = "float32" - target = "llvm" - dev = tvm.device(target, 0) - nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n") - A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype) - assert A.stype == "csr" - C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter") - s = te.create_schedule(C.op) - _nr, _nc = 3, 5 - a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype) - 0.6, 0.0) - a = tvmsp.array(a, dev) - assert a.data.dtype == a.dtype - Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"]) - Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data") - Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices") - binds = {A.data: Ab.data, A.indices: Ab.indices} - f = tvm.build(s, [nr, A.data, C], target, binds=binds) - c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev) - c.data = tvm.nd.empty(a.data.shape, dtype) - c.indices = a.indices - c.indptr = a.indptr - f(a.data.shape[0], a.data, c.data) - tvm.testing.assert_allclose(c.numpy(), a.numpy() * 2.0, rtol=1e-5) - - -def test_sparse_array_tuple(): - """Tests array when it is sparse""" - dtype, itype = "float32", "int32" - target = "llvm" - dev = tvm.device(target, 0) - nr, nc, n = te.size_var("nr"), te.size_var("nc"), te.size_var("n") - A = tvmsp.placeholder(shape=(nr, nc), nonzeros=n, name="A", dtype=dtype) - assert A.stype == "csr" - C = te.compute(A.data.shape, lambda i: A.data[i] * 2.0, tag="cs_scatter") - s = te.create_schedule(C.op) - _nr, _nc = 3, 5 - a = np.maximum(np.random.uniform(size=(_nr, _nc)).astype(dtype) - 0.6, 0.0) - # convert to sparse array tuple - source_array = a - ridx, cidx = np.nonzero(source_array) - data = source_array[ridx, cidx] - a_data = _nd.array(data, dev) - indices = np.nonzero(source_array)[1].astype(itype) - a_indices = _nd.array(indices, dev) - indptr = [0] + np.apply_along_axis(np.count_nonzero, axis=1, 
arr=source_array).tolist() - indptr = np.cumsum(np.array(indptr, itype)).astype(itype) - a_indptr = _nd.array(indptr, dev) - a_init = (a_data, a_indices, a_indptr) - # construct tvm sparse array with tuple - a = tvmsp.array(a_init, shape=source_array.shape, device=dev) - assert a.data.dtype == a.dtype - Ab = namedtuple("CSRBuffer", ["data", "indices", "indptr"]) - Ab.data = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_data") - Ab.indices = tvm.tir.decl_buffer(a.data.shape, a.data.dtype, name="A_indices") - binds = {A.data: Ab.data, A.indices: Ab.indices} - f = tvm.build(s, [nr, A.data, C], target, binds=binds) - c = tvmsp.array(np.zeros((_nr, _nc), dtype), dev) - c.data = tvm.nd.empty(a.data.shape, dtype) - c.indices = a.indices - c.indptr = a.indptr - f(a.data.shape[0], a.data, c.data) - tvm.testing.assert_allclose(c.numpy(), a.numpy() * 2.0, rtol=1e-5) - - -if __name__ == "__main__": - test_static_tensor() - test_dynamic_tensor() - test_sparse_array_tuple() diff --git a/tests/python/relax/test_frontend_from_fx.py b/tests/python/relax/test_frontend_from_fx.py index 3c932f86c582..446c4149fdde 100644 --- a/tests/python/relax/test_frontend_from_fx.py +++ b/tests/python/relax/test_frontend_from_fx.py @@ -3637,7 +3637,6 @@ def main( def test_stack(): - input_info = [ ([1, 3, 10, 10], "float32"), ([1, 3, 10, 10], "float32"), diff --git a/tests/python/runtime/test_runtime_dlpack.py b/tests/python/runtime/test_runtime_dlpack.py index cf12c89cdd51..60a86f662c6c 100644 --- a/tests/python/runtime/test_runtime_dlpack.py +++ b/tests/python/runtime/test_runtime_dlpack.py @@ -35,9 +35,7 @@ def test_from_dlpack_shape_one(): B = te.placeholder((rows, 16), name="B") C = te.compute(A.shape, lambda i, j: A[i, j] + B[i, j], name="C") - s = te.create_schedule(C.op) - - fadd = tvm.build(s, [A, B, C], tgt) + fadd = tvm.build(te.create_prim_func([A, B, C]), target=tgt) dev = tvm.device(tgt.kind.name, 0) diff --git a/tests/python/runtime/test_runtime_measure.py 
b/tests/python/runtime/test_runtime_measure.py index 8955b03241a2..4b39cef18bc5 100644 --- a/tests/python/runtime/test_runtime_measure.py +++ b/tests/python/runtime/test_runtime_measure.py @@ -35,8 +35,7 @@ def my_debug(filename): fout.write("c") X = te.compute((), lambda: tvm.tir.call_packed("my_debug", filename)) - s = te.create_schedule(X.op) - func = tvm.build(s, [X]) + func = tvm.build(te.create_prim_func([X])) x = tvm.nd.empty((), dtype="int32") ftimer = func.time_evaluator(func.entry_name, tvm.cpu(), number=1, repeat=1) diff --git a/tests/python/runtime/test_runtime_module_export.py b/tests/python/runtime/test_runtime_module_export.py index a6554f3a4f75..1dff6c42502e 100644 --- a/tests/python/runtime/test_runtime_module_export.py +++ b/tests/python/runtime/test_runtime_module_export.py @@ -17,211 +17,10 @@ import tvm import tvm.testing -import pytest from tvm.contrib import utils -import os -header_file_dir_path = utils.tempdir() - -def gen_engine_header(): - code = r""" - #ifndef _ENGINE_H_ - #define _ENGINE_H_ - #include - #include - #include - #include - class Engine { - }; - - #endif - """ - header_file = header_file_dir_path.relpath("gcc_engine.h") - with open(header_file, "w") as f: - f.write(code) - - -def generate_engine_module(): - code = r""" - #include - #include - #include "gcc_engine.h" - - extern "C" void gcc_1_(float* gcc_input4, float* gcc_input5, - float* gcc_input6, float* gcc_input7, float* out) { - Engine engine; - } - """ - import tvm.runtime._ffi_api - - gen_engine_header() - csource_module = tvm.runtime._ffi_api.CSourceModuleCreate(code, "cc", [], None) - return csource_module - - -@pytest.mark.skip("LEGACY-TEST: test to be replaced by relax") -@tvm.testing.uses_gpu -def test_mod_export(): - def verify_gpu_mod_export(obj_format): - for device in ["llvm", "cuda"]: - if not tvm.testing.device_enabled(device): - print("skip because %s is not enabled..." 
% device) - return - - synthetic_mod, synthetic_params = relay.testing.synthetic.get_workload() - synthetic_llvm_mod, synthetic_llvm_params = relay.testing.synthetic.get_workload() - with tvm.transform.PassContext(opt_level=3): - _, synthetic_gpu_lib, _ = relay.build_module.build( - synthetic_mod, "cuda", params=synthetic_params, mod_name="cudalib" - ) - _, synthetic_llvm_cpu_lib, _ = relay.build_module.build( - synthetic_llvm_mod, "llvm", params=synthetic_llvm_params, mod_name="llvmlib" - ) - - temp = utils.tempdir() - if obj_format == ".so": - file_name = "deploy_lib.so" - else: - assert obj_format == ".tar" - file_name = "deploy_lib.tar" - path_lib = temp.relpath(file_name) - synthetic_gpu_lib.import_module(synthetic_llvm_cpu_lib) - synthetic_gpu_lib.export_library(path_lib) - loaded_lib = tvm.runtime.load_module(path_lib) - assert loaded_lib.type_key == "library" - assert loaded_lib.imported_modules[0].type_key == "cuda" - # dso modules are merged together - assert len(loaded_lib.imported_modules) == 1 - - def verify_multi_dso_mod_export(obj_format): - for device in ["llvm"]: - if not tvm.testing.device_enabled(device): - print("skip because %s is not enabled..." 
% device) - return - - A = te.placeholder((1024,), name="A") - B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") - s = te.create_schedule(B.op) - mod0 = tvm.build(s, [A, B], "llvm", name="myadd0") - mod1 = tvm.build(s, [A, B], "llvm", name="myadd1") - - temp = utils.tempdir() - if obj_format == ".so": - file_name = "deploy_lib.so" - else: - assert obj_format == ".tar" - file_name = "deploy_lib.tar" - path_lib = temp.relpath(file_name) - - mod0.import_module(mod1) - mod0.export_library(path_lib) - loaded_lib = tvm.runtime.load_module(path_lib) - assert loaded_lib.type_key == "library" - # dso modules are merged - assert len(loaded_lib.imported_modules) == 0 - - def verify_json_import_dso(obj_format): - for device in ["llvm"]: - if not tvm.testing.device_enabled(device): - print("skip because %s is not enabled..." % device) - return - - # Get subgraph Json. - subgraph_json = ( - "json_rt_0\n" - + "input 0 10 10\n" - + "input 1 10 10\n" - + "input 2 10 10\n" - + "input 3 10 10\n" - + "add 4 inputs: 0 1 shape: 10 10\n" - + "sub 5 inputs: 4 2 shape: 10 10\n" - + "mul 6 inputs: 5 3 shape: 10 10\n" - + "json_rt_1\n" - + "input 0 10 10\n" - + "input 1 10 10\n" - + "input 2 10 10\n" - + "input 3 10 10\n" - + "add 4 inputs: 0 1 shape: 10 10\n" - + "sub 5 inputs: 4 2 shape: 10 10\n" - + "mul 6 inputs: 5 3 shape: 10 10" - ) - - temp = utils.tempdir() - subgraph_path = temp.relpath("subgraph.examplejson") - with open(subgraph_path, "w") as f: - f.write(subgraph_json) - - # Get Json and module. 
- A = te.placeholder((1024,), name="A") - B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") - s = te.create_schedule(B.op) - f = tvm.build(s, [A, B], "llvm", name="myadd") - try: - ext_lib = tvm.runtime.load_module(subgraph_path, "examplejson") - except: - print("skip because Loader of examplejson is not presented") - return - ext_lib.import_module(f) - if obj_format == ".so": - file_name = "deploy_lib.so" - else: - assert obj_format == ".tar" - file_name = "deploy_lib.tar" - path_lib = temp.relpath(file_name) - ext_lib.export_library(path_lib) - lib = tvm.runtime.load_module(path_lib) - assert lib.type_key == "examplejson" - assert lib.imported_modules[0].type_key == "library" - - def verify_multi_c_mod_export(): - from shutil import which - - if which("gcc") is None: - print("Skip test because gcc is not available.") - - for device in ["llvm"]: - if not tvm.testing.device_enabled(device): - print("skip because %s is not enabled..." % device) - return - - synthetic_mod, synthetic_params = relay.testing.synthetic.get_workload() - with tvm.transform.PassContext(opt_level=3): - _, synthetic_cpu_lib, _ = relay.build_module.build( - synthetic_mod, "llvm", params=synthetic_params - ) - - A = te.placeholder((1024,), name="A") - B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") - s = te.create_schedule(B.op) - f = tvm.build(s, [A, B], "c", name="myadd") - engine_module = generate_engine_module() - - temp = utils.tempdir() - file_name = "deploy_lib.so" - path_lib = temp.relpath(file_name) - synthetic_cpu_lib.import_module(f) - synthetic_cpu_lib.import_module(engine_module) - kwargs = {"options": ["-O2", "-std=c++17", "-I" + header_file_dir_path.relpath("")]} - work_dir = temp.relpath("work_dir") - os.mkdir(work_dir) - synthetic_cpu_lib.export_library(path_lib, fcompile=False, workspace_dir=work_dir, **kwargs) - assert os.path.exists(os.path.join(work_dir, "devc.o")) - loaded_lib = tvm.runtime.load_module(path_lib) - assert loaded_lib.type_key == "library" - 
# dso modules are merged - assert len(loaded_lib.imported_modules) == 0 - - for obj_format in [".so", ".tar"]: - verify_gpu_mod_export(obj_format) - verify_multi_dso_mod_export(obj_format) - verify_json_import_dso(obj_format) - - verify_multi_c_mod_export() - - -@pytest.mark.skip("LEGACY-TEST: test to be replaced by TensorIR") @tvm.testing.requires_llvm def test_import_static_library(): from tvm import te @@ -229,9 +28,15 @@ def test_import_static_library(): # Generate two LLVM modules. A = te.placeholder((1024,), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") - s = te.create_schedule(B.op) - mod0 = tvm.build(s, [A, B], "llvm", name="myadd0") - mod1 = tvm.build(s, [A, B], "llvm", name="myadd1") + irmod0 = tvm.IRModule.from_expr( + te.create_prim_func([A, B]).with_attr("global_symbol", "myadd0") + ) + irmod1 = tvm.IRModule.from_expr( + te.create_prim_func([A, B]).with_attr("global_symbol", "myadd1") + ) + + mod0 = tvm.build(irmod0, target="llvm") + mod1 = tvm.build(irmod1, target="llvm") assert mod0.implements_function("myadd0") assert mod1.implements_function("myadd1") diff --git a/tests/python/runtime/test_runtime_module_load.py b/tests/python/runtime/test_runtime_module_load.py index 33bd281b045f..130a274c354b 100644 --- a/tests/python/runtime/test_runtime_module_load.py +++ b/tests/python/runtime/test_runtime_module_load.py @@ -101,12 +101,13 @@ def test_device_module_dump(): n = tvm.runtime.convert(1024) A = te.placeholder((n,), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") - s = te.create_schedule(B.op) + + sch = tvm.tir.Schedule(te.create_prim_func([A, B])) # create iter var and assign them tags. 
num_thread = 8 - bx, tx = s[B].split(B.op.axis[0], factor=num_thread) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) + bx, tx = sch.split(sch.get_loops("B")[0], factors=[None, num_thread]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") def check_device(device): dev = tvm.device(device, 0) @@ -114,9 +115,7 @@ def check_device(device): print("Skip because %s is not enabled" % device) return temp = utils.tempdir() - name = "myadd_%s" % device - - f = tvm.build(s, [A, B], device, "llvm", name=name) + f = tvm.build(sch.mod, target=device) path_dso = temp.relpath("dev_lib.so") # test cross compiler function @@ -143,8 +142,7 @@ def check_stackvm(device): print("Skip because %s is not enabled" % device) return temp = utils.tempdir() - name = "myadd_%s" % device - f = tvm.build(s, [A, B], device, "stackvm", name=name) + f = tvm.build(sch.mod, target=tvm.target.Target(device, host="stackvm")) path_dso = temp.relpath("dev_lib.stackvm") f.export_library(path_dso) f1 = tvm.runtime.load_module(path_dso) diff --git a/tests/python/runtime/test_runtime_module_property.py b/tests/python/runtime/test_runtime_module_property.py index bd71e856d917..97c51ff93996 100644 --- a/tests/python/runtime/test_runtime_module_property.py +++ b/tests/python/runtime/test_runtime_module_property.py @@ -33,12 +33,7 @@ def create_csource_module(): def create_llvm_module(): A = te.placeholder((1024,), name="A") B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") - s = te.create_schedule(B.op) - return tvm.build(s, [A, B], "llvm", name="myadd0") - - -def create_aot_module(): - return tvm.get_global_func("relay.build_module._AOTExecutorCodegen")() + return tvm.build(te.create_prim_func([A, B]), target="llvm") def test_property(): @@ -52,11 +47,6 @@ def test_property(): expected={"is_binary_serializable": False, "is_runnable": True, "is_dso_exportable": True}, ) - checker( - create_aot_module(), - expected={"is_binary_serializable": False, 
"is_runnable": True, "is_dso_exportable": False}, - ) - if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/runtime/test_runtime_rpc.py b/tests/python/runtime/test_runtime_rpc.py index 31cab2819df1..717cc8fffa05 100644 --- a/tests/python/runtime/test_runtime_rpc.py +++ b/tests/python/runtime/test_runtime_rpc.py @@ -73,8 +73,7 @@ def test_bigendian_rpc(): def verify_rpc(remote, target, shape, dtype): A = te.placeholder(shape, dtype=dtype) B = te.compute(A.shape, lambda i: A[i] + tvm.tir.const(1, A.dtype)) - s = te.create_schedule(B.op) - f = tvm.build(s, [A, B], target, name="myadd") + f = tvm.build(te.create_prim_func([A, B]), target=target) dev = remote.cpu(0) a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), device=dev) diff --git a/tests/python/runtime/test_runtime_trace.py b/tests/python/runtime/test_runtime_trace.py index 08f56b56c8c7..58d1a079e46b 100644 --- a/tests/python/runtime/test_runtime_trace.py +++ b/tests/python/runtime/test_runtime_trace.py @@ -23,8 +23,7 @@ def test_trace_default_action(): n = 2 x = te.placeholder((n, n, n), name="X", dtype="float32") y = te.compute(x.shape, lambda i, j, k: tvm.tir.trace([i, j, k, x[i][j][k]])) - s = te.create_schedule(y.op) - f = tvm.build(s, [x, y], target="llvm") + f = tvm.build(te.create_prim_func([x, y]), target="llvm") xnd = tvm.nd.array(np.ones((n, n, n), dtype=x.dtype)) ynd = tvm.nd.array(np.zeros((n, n, n), dtype=y.dtype)) f(xnd, ynd) @@ -44,8 +43,7 @@ def check_assign(dtype): z = te.compute( x.shape, lambda i, j, k: tvm.tir.trace([y[i][j][k]], "tvm.tir.trace_callback2") ) - s = te.create_schedule(z.op) - f = tvm.build(s, [x, y, z], "llvm") + f = tvm.build(te.create_prim_func([x, y, z]), "llvm") xnd = tvm.nd.array(np.ones((n, n, n), dtype=x.dtype)) ynd = tvm.nd.array(np.zeros((n, n, n), dtype=y.dtype)) @@ -74,8 +72,7 @@ def check_expr_sum(dtype): lambda i, j, k: tvm.tir.trace([a[i][j][k]], "tvm.tir.trace_callback3") + tvm.tir.trace([b[i][j][k]], 
"tvm.tir.trace_callback3"), ) - s = te.create_schedule(c.op) - f = tvm.build(s, [a, b, c]) + f = tvm.build(te.create_prim_func([a, b, c])) xnd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=a.dtype))) ynd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=b.dtype))) znd = tvm.nd.array(np.zeros((n, n, n), dtype=c.dtype)) @@ -105,8 +102,7 @@ def check_expr_sum(dtype): + tvm.tir.trace([i, j, k, d[i][j][k]], "tvm.tir.trace_silent") + tvm.tir.trace([i, j, k, e[i][j][k]], "tvm.tir.trace_silent"), ) - s = te.create_schedule(c.op) - f = tvm.build(s, [a, b, d, e, c]) + f = tvm.build(te.create_prim_func([a, b, d, e, c])) a_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=a.dtype))) b_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=b.dtype))) d_nd = tvm.nd.array(np.array(np.ones((n, n, n), dtype=d.dtype))) @@ -135,8 +131,7 @@ def check_expr_sum_custom(dtype): lambda i, j: tvm.tir.trace([a[i][j]], "tvm.tir.trace_callback4") + tvm.tir.trace([b[i][j]], "tvm.tir.trace_callback4"), ) - s = te.create_schedule(c.op) - f = tvm.build(s, [a, b, c]) + f = tvm.build(te.create_prim_func([a, b, c])) npa = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=a.dtype) npb = np.array([[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]], dtype=a.dtype) xnd = tvm.nd.array(npa) @@ -163,8 +158,7 @@ def check_assign(dtype): x = te.placeholder((n,), name="X", dtype=dtype) y = te.compute(x.shape, lambda i: tvm.tir.trace([x[i]], "tvm.tir.trace_change_int_first")) z = te.compute(x.shape, lambda i: tvm.tir.trace([y[i]], "tvm.tir.trace_change_int_second")) - s = te.create_schedule(z.op) - f = tvm.build(s, [x, y, z], "llvm") + f = tvm.build(te.create_prim_func([x, y, z])) xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype)) ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype)) @@ -195,8 +189,7 @@ def check_assign(dtype): z = te.compute( x.shape, lambda i: tvm.tir.trace([y[i]], "tvm.tir.trace_change_float_second") ) - s = te.create_schedule(z.op) - f = tvm.build(s, [x, y, z], 
"llvm") + f = tvm.build(te.create_prim_func([x, y, z]), target="llvm") xnd = tvm.nd.array(np.ones((n,), dtype=x.dtype)) ynd = tvm.nd.array(np.zeros((n,), dtype=y.dtype)) diff --git a/tests/python/target/test_target_target.py b/tests/python/target/test_target_target.py index cda228939f31..b99834aef35a 100644 --- a/tests/python/target/test_target_target.py +++ b/tests/python/target/test_target_target.py @@ -578,7 +578,7 @@ def func(): func = func.with_attr("Target", target) target2 = tvm.ir.load_json(tvm.ir.save_json(target)) mod = tvm.IRModule({"main": func}) - lib = tvm.build({target2: mod}, target_host=target) + lib = tvm.build(mod, target=target2) lib["func"]() diff --git a/tests/python/te/test_te_autodiff.py b/tests/python/te/test_te_autodiff.py deleted file mode 100644 index a5995ff0337f..000000000000 --- a/tests/python/te/test_te_autodiff.py +++ /dev/null @@ -1,351 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import numpy as np -import pytest -import tvm -from tvm import te, topi -from tvm.testing import assert_allclose -from tvm.topi.utils import get_const_tuple - - -def check_grad( - out, inputs, args=[], data_range=(-10, 10), desired_grads=None, assert_no_jacobian=True -): - inputs = inputs if isinstance(inputs, list) else [inputs] - - def check_device(device, host="llvm"): - dev = tvm.device(device, 0) - if not tvm.testing.device_enabled(host): - return - - sout = te.create_schedule(out.op) - mout = tvm.build(sout, [out] + inputs + args) - out_shape = get_const_tuple(out.shape) - - l, h = data_range - input_data = [ - tvm.nd.array( - np.random.uniform(l, h, size=get_const_tuple(input.shape)).astype(input.dtype) - ) - for input in inputs - ] - arg_vals = [ - tvm.nd.array(np.random.uniform(l, h, size=get_const_tuple(arg.shape)).astype(arg.dtype)) - for arg in args - ] - - ones = topi.full_like(out, 1.0) - # we provide head to sum and reduce the output dimension, - # which equals to grad(out.sum(), inputs) - grads = te.gradient(out, inputs, head=ones) - grad_sched = te.create_schedule([grad.op for grad in grads]) - mgrad = tvm.build(grad_sched, list(grads) + inputs + args) - if assert_no_jacobian: - # TODO(yzhliu): it is better to visit the expression and do assertion - lowered_ir = str(tvm.lower(grad_sched, list(grads) + inputs + args, simple_mode=True)) - assert "jacobian" not in lowered_ir, lowered_ir - - grad_data = [tvm.nd.empty(get_const_tuple(i.shape), g.dtype) for i, g in zip(inputs, grads)] - - mgrad(*grad_data, *input_data, *arg_vals) - g_res = [g.numpy() for g in grad_data] - - if desired_grads: - assert isinstance(desired_grads, list) - for actual, desired in zip(g_res, desired_grads): - assert_allclose(actual, desired, rtol=0.1, atol=1e-2) - else: - - def forward(*in_data): - out_data = tvm.nd.empty(out_shape, out.dtype) - mout(out_data, *[tvm.nd.array(d) for d in list(in_data)]) - return out_data.numpy().sum() - - tvm.testing.check_numerical_grads( - 
forward, [d.numpy() for d in input_data + arg_vals], g_res - ) - - check_device("cpu") - - -def test_basic_operation(): - np.random.seed(0) - shape = (10, 10) - x = te.var("x", dtype="float32") - k = te.reduce_axis((0, 10), name="k") - l = te.reduce_axis((0, 10), name="l") - A0 = te.placeholder(shape, name="A0") - A1 = te.placeholder(shape, name="A1") - zeros = np.zeros(shape) - - B = te.compute(shape, lambda i, j: A0[i, j], name="B") - check_grad(B, [A0]) - - B = te.compute(shape, lambda i, j: A0[i, j] + A1[i, j], name="B") - check_grad(B, [A0, A1]) - - B = te.compute(shape, lambda i, j: A0[i, j] + A0[j, i], name="B") - check_grad(B, A0) - - B = te.compute(shape, lambda i, j: te.floor(A0[i, j]), name="B") - check_grad(B, A0, desired_grads=[zeros]) - - B = te.compute(shape, lambda i, j: te.ceil(A0[i, j]), name="B") - check_grad(B, A0, desired_grads=[zeros]) - - B = te.compute(shape, lambda i, j: te.trunc(A0[i, j]), name="B") - check_grad(B, A0, desired_grads=[zeros]) - - B = te.compute(shape, lambda i, j: te.round(A0[i, j]), name="B") - check_grad(B, A0, desired_grads=[zeros]) - - B = te.compute(shape, lambda i, j: A0[i, j] + te.exp(A0[j, i]), name="B") - check_grad(B, A0) - - B = te.compute(shape, lambda i, j: te.log(0.1 + te.abs(A0[i, j] + te.exp(A0[j, i]))), name="B") - check_grad(B, A0) - - B = te.compute(shape, lambda i, j: te.sigmoid(A0[i, j] * A0[i, j] * A0[j, i]), name="B") - check_grad(B, A0) - - B = te.compute(shape, lambda i, j: te.tanh(A0[i, j] * A0[i, j] * A0[j, i]), name="B") - check_grad(B, A0) - - B = te.compute(shape, lambda i, j: te.sqrt(A0[i, j] * A0[i, j] * A0[j, i]), name="B") - check_grad(B, A0, data_range=(0.1, 10)) - - B = te.compute(shape, lambda i, j: te.power(te.abs(A0[i, j]), A0[j, i]), name="B") - check_grad(B, A0, data_range=(-4, 4)) - - B = te.compute(shape, lambda i, j: A0[i, j] * A0[j, i], name="B") - check_grad(B, A0) - - B = te.compute((10,), lambda i: te.sum(A0[i, k] * A0[k, i], axis=k), name="B") - check_grad(B, A0) - - B = 
te.compute(shape, lambda i, j: te.sum(A0[i, k] * A0[k, i] + 5, axis=k), name="B") - check_grad(B, A0) - - B = te.compute(shape, lambda i, j: te.max(A0[i, k] * A0[k, j] + 5, axis=k), name="B") - check_grad(B, A0) - - B = te.compute(shape, lambda i, j: A0[i, j] * (A1[j, i] + A0[j, i]), name="B") - check_grad(B, [A0, A1]) - - B = te.compute( - shape, lambda i, j: te.sum(A0[k, k] - A0[te.min(j + k, 9), j] * A0[i, k], axis=k), name="B" - ) - check_grad(B, A0) - - def fcombine(x, y): - return x * y - - def fidentity(t0): - return tvm.tir.const(1, t0) - - prod = te.comm_reducer(fcombine, fidentity, name="prod") - B = te.compute((10, 10), lambda i, j: prod(A0[i, k] + A0[k, i], axis=k), name="B") - check_grad(B, A0) - - X = te.placeholder((10,), name="X") - A = te.compute((10,), lambda i: X[i] + X[9 - i]) - B = te.compute((10,), lambda i: X[i] * X[9 - i]) - Y = topi.tensordot(A, B, 1) - check_grad(Y, X) - - X = te.placeholder((3, 3), name="X") - Y = topi.einsum("ii->i", (X)) - check_grad(Y, X) - - -def test_topi(): - X = te.placeholder((1, 2, 4, 4), name="X") - W = te.placeholder((5, 2, 3, 3), name="W") - W1 = te.placeholder((2, 5, 3, 3), name="W1") - W2 = te.placeholder((1,), name="W2") - - R = topi.nn.conv2d(X, W, 1, 1, 1) - check_grad(R, [X, W]) - - R1 = topi.nn.conv2d(topi.nn.relu(R), W1, 1, 0, 1) - check_grad(R1, [X, W, W1]) - - R = topi.broadcast_to(W2, (5, 2, 3, 3)) - check_grad(R, [W2]) - - R = topi.nn.conv2d(X, topi.broadcast_to(W2, (5, 2, 3, 3)), 1, 1, 1) - check_grad(R, [X, W2]) - - R = topi.nn.pool2d(X, [2, 2], [1, 1], [2, 2], [0, 0, 0, 0], "avg") - check_grad(R, X) - - R = topi.nn.pool2d(X, [2, 2], [1, 1], [2, 2], [0, 0, 0, 0], "max") - check_grad(R, X) - - X = te.placeholder((1, 2, 5, 5), name="X") - R = topi.reshape(X, (1, 32)) - check_grad(R, [X]) - - X = te.placeholder((1, 2, 5, 5), name="X") - W = te.placeholder((2, 2, 3, 3), name="W") - - S = topi.reshape(X, (1, 50)) - check_grad(S, [X]) - - R = X + topi.nn.conv2d(X + topi.nn.conv2d(X, W, 1, 1, 1), W, 1, 
1, 1) - check_grad(R, [X, W]) - - S = topi.nn.softmax(topi.reshape(R, (1, 50))) - check_grad(S, [X, W]) - - S = topi.sigmoid(topi.reshape(R, (1, 50))) - check_grad(S, [X, W]) - - S = topi.tanh(topi.reshape(R, (1, 50))) - check_grad(S, [X, W]) - - S = topi.nn.log_softmax(topi.reshape(R, (1, 50))) - check_grad(S, [X, W]) - check_grad(S, [W], [X]) - - X = te.placeholder((1, 2, 3, 5), name="X") - Y = te.placeholder((1, 2, 7, 5), name="Y") - S = topi.concatenate((X, Y), 2) - check_grad(S, [X, Y]) - - X = te.placeholder((1, 2, 6, 5), name="X") - (S, R) = topi.split(X, 2, 2) - check_grad(S, [X]) - check_grad(R, [X]) - R1 = topi.concatenate((S, R), 2) - check_grad(R1, [X]) - R2 = topi.concatenate((R, S), 2) - check_grad(R2, [X]) - - X = te.placeholder((4, 5), name="X") - I = te.placeholder((100,), name="I", dtype="int32") - R = topi.take(X, topi.abs(I)) - check_grad(R, [X], [I]) - - W = te.placeholder((5, 5), name="W") - exps = topi.exp(topi.nn.dense(X, W)) - sumexps = topi.sum(exps, axis=-1, keepdims=True) - R = exps / sumexps - check_grad(R, [X, W], data_range=(-1, 1)) - - -def test_stride_dilation(): - X = te.placeholder((1, 2, 10, 10), name="X") - W = te.placeholder((2, 2, 1, 1), name="W") - - Y = topi.nn.conv2d(X, W, 1, 0, 1) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 2, 0, 1) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 3, 0, 1) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 1, 0, 2) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 2, 0, 2) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 3, 0, 2) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 1, 0, 3) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 2, 0, 3) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 3, 0, 3) - check_grad(Y, [X, W]) - - W = te.placeholder((2, 2, 2, 2), name="W") - - Y = topi.nn.conv2d(X, W, 1, 0, 1) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 2, 0, 1) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 3, 0, 1) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, 
W, 1, 0, 2) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 2, 0, 2) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 3, 0, 2) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 1, 0, 3) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 2, 0, 3) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 3, 0, 3) - check_grad(Y, [X, W]) - - W = te.placeholder((2, 2, 3, 3), name="W") - - Y = topi.nn.conv2d(X, W, 1, 0, 1) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 2, 0, 1) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 3, 0, 1) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 1, 0, 2) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 2, 0, 2) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 3, 0, 2) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 1, 0, 3) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 2, 0, 3) - check_grad(Y, [X, W]) - Y = topi.nn.conv2d(X, W, 3, 0, 3) - check_grad(Y, [X, W]) - - Y = topi.nn.pool2d(X, [1, 1], [1, 1], [1, 1], [0, 0, 0, 0], "max") - check_grad(Y, [X]) - Y = topi.nn.pool2d(X, [1, 1], [1, 1], [2, 2], [0, 0, 0, 0], "max") - check_grad(Y, [X]) - Y = topi.nn.pool2d(X, [1, 1], [1, 1], [3, 3], [0, 0, 0, 0], "max") - check_grad(Y, [X]) - Y = topi.nn.pool2d(X, [2, 2], [1, 1], [1, 1], [0, 0, 0, 0], "max") - check_grad(Y, [X]) - Y = topi.nn.pool2d(X, [2, 2], [1, 1], [2, 2], [0, 0, 0, 0], "max") - check_grad(Y, [X]) - Y = topi.nn.pool2d(X, [2, 2], [1, 1], [3, 3], [0, 0, 0, 0], "max") - check_grad(Y, [X]) - Y = topi.nn.pool2d(X, [3, 3], [1, 1], [1, 1], [0, 0, 0, 0], "max") - check_grad(Y, [X]) - Y = topi.nn.pool2d(X, [3, 3], [1, 1], [2, 2], [0, 0, 0, 0], "max") - check_grad(Y, [X]) - Y = topi.nn.pool2d(X, [3, 3], [1, 1], [3, 3], [0, 0, 0, 0], "max") - check_grad(Y, [X]) - - -@pytest.mark.xfail -def test_reduction_init(): - np.random.seed(0) - shape = (10, 10) - k = te.reduce_axis((0, 10), name="k") - A0 = te.placeholder(shape, name="A0") - - B = te.compute((10,), lambda i: te.sum(A0[i, k] * A0[k, i], axis=k, init=0.0), 
name="B") - check_grad(B, A0) - - -if __name__ == "__main__": - test_basic_operation() - test_topi() - test_stride_dilation() diff --git a/tests/python/te/test_te_build_lower.py b/tests/python/te/test_te_build_lower.py deleted file mode 100644 index 50d5119b43a0..000000000000 --- a/tests/python/te/test_te_build_lower.py +++ /dev/null @@ -1,65 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import tvm -from tvm import te - - -def test_lower_rfactor(): - n = te.size_var("n") - m = te.size_var("m") - A = te.placeholder((n, m), name="A") - k = te.reduce_axis((0, m), "k") - B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B") - s = te.create_schedule(B.op) - ko, ki = s[B].split(B.op.reduce_axis[0], factor=16) - BF = s.rfactor(B, ki) - xo, xi = s[B].split(s[B].op.axis[0], factor=32) - s[B.op].bind(xo, te.thread_axis("blockIdx.x")) - s[B.op].bind(xi, te.thread_axis("threadIdx.y")) - s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.x")) - s[BF].compute_at(s[B], s[B].op.reduce_axis[0]) - fapi = tvm.lower(s, [A, B]) - - -def test_dependent_output_shape(): - n, m, x = te.size_var("n"), te.size_var("m"), te.size_var("x") - A = te.placeholder((n, m)) - B = te.compute((m, n // x), lambda i, j: A[i, j], name="B") - s = te.create_schedule(B.op) - mod = tvm.build(s, [A, B, x]) - - -def test_split_uneven_unique_likely(): - a = te.placeholder( - (16, 16), - ) - b = te.placeholder( - (16, 16), - ) - c = te.compute((16, 16), lambda x, y: a[x, y] + b[x, y]) - - x, y = c.op.axis - sch = te.create_schedule(c.op) - xo, xi = sch[c].split(x, 5) - stmt = tvm.lower(sch, [a, b, c])["main"].body - assert isinstance(stmt.body.body, tvm.tir.stmt.IfThenElse) - - -if __name__ == "__main__": - test_lower_rfactor() - test_dependent_output_shape() - test_split_uneven_unique_likely() diff --git a/tests/python/te/test_te_group.py b/tests/python/te/test_te_group.py deleted file mode 100644 index e57040abc085..000000000000 --- a/tests/python/te/test_te_group.py +++ /dev/null @@ -1,90 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Test group effect""" -import tvm -from tvm import te - - -def test_scan_group(): - m = te.size_var("m") - n = te.size_var("n") - x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - s_state = te.placeholder((m, n)) - s_init = te.compute((1, n), lambda _, i: x[0, i]) - - s_update1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] + x[t, i]) - s_update2 = te.compute((m, n), lambda t, i: s_update1[t, i] + 1) - s_update3 = te.compute((m, n), lambda t, i: s_update2[t, i] + 1) - res = tvm.te.scan(s_init, s_update3, s_state, inputs=x) - - s = te.create_schedule(res.op) - assert s[s_update1].group is not None - assert s[s_update2].group == s[s_update1].group - # Assign within group, is valid - s[s_update1].compute_at(s[s_update2], s_update2.op.axis[1]) - # create a new group, for [s_update2 and s_update1] - g2 = s.create_group(outputs=s_update2, inputs=[s_state, x]) - assert g2.group is not None - assert g2.group == s[s_update3].group - assert s[s_update2].group == g2 - assert s[s_update1].group == g2 - g2.compute_at(s[s_update3], s_update3.op.axis[1]) - assert g2.attach_stage == s[s_update3] - try: - # compute outside group error. 
- s[s_update2].compute_at(s[s_init], s_init.op.axis[0]) - assert False - except tvm.error.TVMError: - pass - - -def test_compute_group(): - m = te.size_var("m") - n = te.size_var("n") - x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1") - x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2") - s = te.create_schedule(x2.op) - g = s.create_group(outputs=x1, inputs=x, include_inputs=True) - assert s[x1].group == g - assert s[x].group == g - g.compute_at(s[x2], x2.op.axis[1]) - assert g.attach_stage == s[x2] - assert g.num_child_stages == 2 - - -def test_nest_group(): - m = te.size_var("m") - n = te.size_var("n") - x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1") - x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2") - s = te.create_schedule(x2.op) - g1 = s.create_group(outputs=x1, inputs=x) - g2 = s.create_group(outputs=x1, inputs=x, include_inputs=True) - assert set(s.groups) == set([g1, g2]) - assert s[x].group == g2 - assert s[x1].group == g1 - assert g1.group == g2 - assert g2.num_child_stages == 2 - assert g1.num_child_stages == 1 - - -if __name__ == "__main__": - test_nest_group() - test_compute_group() - test_scan_group() diff --git a/tests/python/te/test_te_hybrid_script.py b/tests/python/te/test_te_hybrid_script.py deleted file mode 100644 index 862e80ffb6ce..000000000000 --- a/tests/python/te/test_te_hybrid_script.py +++ /dev/null @@ -1,872 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import tvm, inspect, sys, traceback, numpy, pytest, types, os - -from tvm import te -from tvm.contrib import utils -from tvm.te.hybrid import script -from tvm.te.hybrid.runtime import HYBRID_GLOBALS - -import tvm.testing - - -@pytest.mark.skip -def run_and_check(func, args, var_dict={}, target="llvm", sch=None, outs=None): - def tvm_val_2_py_val(val): - val = tvm.tir.stmt_functor.substitute(val, var_dict) - val = tvm.arith.Analyzer().simplify(val) - assert isinstance(val, (tvm.tir.IntImm,)) - return val.value - - dev = tvm.device(target, 0) - op = None - - if sch is None: - outs = func(*tuple(tvm.runtime.convert(i) if isinstance(i, list) else i for i in args)) - op = outs[0].op if isinstance(outs, list) else outs.op - sch = te.create_schedule(op) - else: - assert outs is not None - assert isinstance(outs, list) - op = outs[0].op - - emu_args = [] - nd_args = [] - for i in args: - if isinstance(i, te.tensor.Tensor): - shape = [tvm_val_2_py_val(j) for j in i.shape] - emu_args.append(numpy.random.randn(*shape).astype(i.dtype)) - nd_args.append(tvm.nd.array(emu_args[-1], dev)) - elif isinstance(i, tvm.tir.Var): - emu_args.append(tvm_val_2_py_val(i)) - nd_args.append(emu_args[-1]) - else: - assert isinstance(i, list) - emu_args.append(numpy.array(i)) - - compile_args = [i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var))] + ( - outs if isinstance(outs, list) else [outs] - ) - module = tvm.build(sch, compile_args, target=target) - assert module - - out_tensors = [] - for i in range(op.num_outputs): - output = op.output(i) - shape = 
[tvm_val_2_py_val(j) for j in output.shape] - nd_args.append(tvm.nd.array(numpy.zeros(shape).astype(output.dtype), dev)) - out_tensors.append(nd_args[-1]) - - ref_data = func(*emu_args) - if isinstance(ref_data, numpy.ndarray): - ref_data = [ref_data] - - module(*nd_args) - - for nd, np in zip(out_tensors, ref_data): - tvm.testing.assert_allclose(nd.numpy(), np, rtol=1e-5, atol=1e-5) - - module_args = [i for i in args if isinstance(i, (te.tensor.Tensor, tvm.tir.Var))] - module_outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs - h_module = te.hybrid.build(sch, module_args, module_outs) - - return h_module, module_args, module_outs - - -@script -def outer_product(n, m, a, b): - """This is a simple outer product. - Actually this function is not required to be documented. - I write this docstring to test skipping docstring functionality. - """ - c = output_tensor((n, m), a.dtype) - for i in range(n): - for j in range(m): - assert i < n and j < m, "index out of range!" - c[i, j] = a[i] * b[j] - return c - - -@tvm.testing.skip_if_wheel_test -# Test global function -# Test bridge between frontend and backend -def test_outer_product(): - n = te.size_var("n") - m = te.size_var("m") - a = te.placeholder((n,), name="a") - b = te.placeholder((m,), name="b") - - try: - c = outer_product(n, m, a, b) - ir = c.op.body - except IOError as err: - assert sys.version_info[0] == 2 and str(err) == "could not get source code" - return - - # Check for i in (0, n) - assert isinstance(ir, tvm.tir.For) - assert ir.loop_var.name == "i" - assert ir.min.value == 0 - assert ir.extent.name == "n" - ibody = ir.body - assert isinstance(ibody, tvm.tir.For) - # Check for j in (0, m) - assert ibody.loop_var.name == "j" - assert ibody.min.value == 0 - assert ibody.extent.name == "m" - # Check loop body - jblock = ibody.body - assert isinstance(jblock, tvm.tir.SeqStmt) - jbody = jblock[0] - assert isinstance(jbody, tvm.tir.AssertStmt) - assert isinstance(jbody.message, tvm.tir.StringImm) - 
assert jbody.message.value == "index out of range!" - jbody = jblock[1] - assert isinstance(jbody, tvm.tir.ProducerStore) - assert jbody.producer.op.name == "c" - assert len(jbody.indices) == 2 - assert jbody.indices[0].name == "i" - assert jbody.indices[1].name == "j" - assert isinstance(jbody.value, tvm.tir.Mul) - mul = jbody.value - assert isinstance(mul.a, tvm.tir.ProducerLoad) - assert mul.a.producer.name == "a" - assert mul.b.producer.name == "b" - - func, ins, outs = run_and_check(outer_product, [n, m, a, b], {n: 99, m: 101}) - temp = utils.tempdir() - path = temp.relpath("%s.py" % func.name) - func.save(path) - func_ = te.hybrid.HybridModule() - func_.load(path) - run_and_check(func_, ins, {n: 99, m: 101}, outs=outs) - - for key, _ in HYBRID_GLOBALS.items(): - assert key not in globals().keys() - assert key not in outer_product.__globals__.keys() - - -@tvm.testing.skip_if_wheel_test -# Test local function -# Test allocation of local variable -def test_fanout(): - @script - def fanout(n, a): - three = 3.0 - b = output_tensor((a.shape[0] - 3,), a.dtype) - for i in range(a.shape[0] - 3): - sigma = 0.0 - for j in range(3): - sigma += a[i + j] - sigma = sigma / three - b[i] = sigma - return b - - n = te.size_var("n") - a = te.placeholder((n,), "float32", name="a") - try: - b = fanout(n, a) - ir = b.op.body - except IOError as err: - assert sys.version_info[0] == 2 and str(err) == "could not get source code" - return - - # Check for i in (0, n-3) - assert isinstance(ir, tvm.tir.For) - assert ir.loop_var.name == "i" - assert ir.min.value == 0 - tvm.ir.assert_structural_equal(ir.extent, n - 3) - # Check loopbody - abody = ir.body - assert isinstance(abody, tvm.tir.ProducerRealize) - assert abody.bounds[0].min.value == 0 - assert abody.bounds[0].extent.value == 1 - assert abody.producer.op.name == "sigma" - # Check i loop body - rbody = abody.body - assert isinstance(rbody[0], tvm.tir.ProducerStore) - assert rbody[0].producer.op.name == "sigma" - assert 
len(rbody[0].indices) == 1 - assert rbody[0].indices[0].value == 0 - # Check fanout loop - jloop = rbody[1] - assert jloop.loop_var.name == "j" - assert jloop.min.value == 0 - assert jloop.extent.value == 3 - jbody = jloop.body - assert isinstance(jbody, tvm.tir.ProducerStore) - assert len(jbody.indices) == 1 - assert jbody.indices[0].value == 0 - assert jbody.producer.op.name == "sigma" - assert isinstance(jbody.value, tvm.tir.Add) - value = jbody.value - assert isinstance(value.a, tvm.tir.ProducerLoad) - assert value.a.producer.name == "sigma" - assert len(value.a.indices) == 1 - assert value.a.indices[0].value == 0 - assert value.b.producer.name == "a" - assert len(value.b.indices) == 1 - tvm.ir.assert_structural_equal(value.b.indices[0], ir.loop_var + jloop.loop_var) - divide = rbody[2] - assert isinstance(divide, tvm.tir.ProducerStore) - assert len(divide.indices) == 1 - assert divide.indices[0].value == 0 - value = divide.value - assert isinstance(value, tvm.tir.Mul) - assert value.a.producer.name == "sigma" - assert len(value.a.indices) == 1 - assert value.a.indices[0].value == 0 - assert abs(value.b.value - (1 / 3.0)) < 1e-5 - write = rbody[3] - assert isinstance(write, tvm.tir.ProducerStore) - assert write.producer.op.name == "b" - assert write.value.producer.name == "sigma" - assert len(write.value.indices) == 1 - assert write.value.indices[0].value == 0 - - func, ins, outs = run_and_check(fanout, [n, a], {n: 10}) - run_and_check(func, ins, {n: 10}, outs=outs) - - -def test_looptype(): - @script - def looptype(a, b, c): - d = output_tensor((16,), "int32") - e = output_tensor((16,), "int32") - f = output_tensor((16,), "int32") - for i in parallel(16): - d[i] = a[i] - for j in vectorize(16): - e[j] = b[j] - for k in unroll(16): - f[k] = c[k] - return d, e, f - - a = te.placeholder((16,), name="a", dtype="int32") - b = te.placeholder((16,), name="b", dtype="int32") - c = te.placeholder((16,), name="c", dtype="int32") - try: - d, e, f = looptype(a, b, c) - ir 
= d.op.body - except: - return - iloop = ir[0] - jloop = ir[1] - kloop = ir[2] - assert iloop.kind == tvm.tir.ForKind.PARALLEL - assert jloop.kind == tvm.tir.ForKind.VECTORIZED - assert kloop.kind == tvm.tir.ForKind.UNROLLED - - func, ins, outs = run_and_check(looptype, [a, b, c]) - run_and_check(func, ins, outs=outs) - - -@tvm.testing.skip_if_wheel_test -def test_if(): - @script - def if_then_else(a): - b = output_tensor((10,), "int32") - c = output_tensor((10,), "int32") - for i in range(10): - if i % 2 == 0: - c[i] = a[i] - else: - c[i] = b[i] - for i in unroll(10): - b[i] = -1 if i % 2 == 0 else 1 - return b, c - - a = te.placeholder((10,), dtype="int32", name="a") - - func, ins, outs = run_and_check(if_then_else, [a]) - run_and_check(func, ins, outs=outs) - - @script - def if_triple_condition(a): - b = output_tensor((10,), "int32") - for i in range(10): - if 0 <= i < 5: - b[i] = a[i] - else: - b[i] = a[i] + 1 - return b - - func, ins, outs = run_and_check(if_triple_condition, [a]) - run_and_check(func, ins, outs=outs) - - @script - def if_and(a): - b = output_tensor((10,), "int32") - for i in range(10): - if i >= 0 and i < 5: - b[i] = a[i] - else: - b[i] = a[i] + 1 - return b - - func, ins, outs = run_and_check(if_and, [a]) - run_and_check(func, ins, outs=outs) - - -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_bind(): - @script - def vec_add(a, b): - c = output_tensor((1000,), "float32") - for tx in bind("threadIdx.x", 1000): - c[tx] = a[tx] + b[tx] - return c - - a = te.placeholder((1000,), dtype="float32", name="a") - b = te.placeholder((1000,), dtype="float32", name="b") - func, ins, outs = run_and_check(vec_add, [a, b], target="cuda") - run_and_check(func, ins, outs=outs, target="cuda") - - @script - def raw(a, b): - c = output_tensor((1000,), "float32") - for i in range(1000): - c[i] = a[i] + b[i] - return c - - c = raw(a, b) - sch = te.create_schedule(c.op) - x = te.thread_axis("threadIdx.x") - sch[c].bind(c.op.axis[0], x) - func, ins, 
outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target="cuda") - run_and_check(func, ins, outs=outs, target="cuda") - - @te.hybrid.script - def foo(a): - c = output_tensor((a.shape[0],), a.dtype) - total = allocate((1,), a.dtype, "local") - len_i = a.shape[0] - len_j = a.shape[1] - for i in bind("threadIdx.x", len_i): - total[0] = 0.0 - for k in const_range(len_j): - total[0] += a[i, k] - c[i] = total[0] - - return c - - a = te.placeholder((8, 4), "float32") - c = foo(a) - s = te.create_schedule(c.op) - ir = tvm.lower(s, [a, c]) - - func, ins, outs = run_and_check(foo, [a], target="cuda") - run_and_check(func, ins, outs=outs, target="cuda") - - @te.hybrid.script - def max_threads(a): - b = output_tensor(a.shape, a.dtype) - n = a.shape[0] - m = max_num_threads(True) - for i in bind("threadIdx.x", m): - for j in bind("blockIdx.x", ceil_div(n, m)): - if i * m + j < n: - b[i * m + j] = a[i * m + j] + a[i * m + j] - return b - - a = te.placeholder((10000,), "float32") - with tvm.target.Target("cuda"): - func, ins, outs = run_and_check(max_threads, [a], target="cuda") - run_and_check(func, ins, outs=outs, target="cuda") - - -@tvm.testing.skip_if_wheel_test -def test_math_intrin(): - @script - def intrin_real(a): - b = output_tensor((8,), "float32") - b[0] = sqrt(a[0]) - b[1] = log(a[1]) - b[2] = exp(a[2]) - b[3] = sigmoid(a[3]) - b[4] = power(a[4], a[5]) - b[5] = tanh(a[5]) - b[6] = min(a[4], a[5]) - b[7] = max(a[5], a[6]) - return b - - a8 = te.placeholder((8,), dtype="float32", name="a") - b8 = intrin_real(a8) - sch = te.create_schedule(b8.op) - func = tvm.build(sch, [a8, b8]) - assert func - a = numpy.arange(2, 10).astype("float32") - tvm_a = tvm.nd.array(a) - tvm_b = tvm.nd.array(numpy.zeros((8,), dtype="float32")) - b = intrin_real(a) - func(tvm_a, tvm_b) - tvm.testing.assert_allclose(b, tvm_b.numpy(), rtol=1e-5) - - @script - def intrin_int(a): - b = output_tensor((1,), "int32") - b[0] = popcount(a[0]) - return b - - a1 = te.placeholder((1,), dtype="int32") - 
b1 = intrin_int(a1) - sch = te.create_schedule(b1.op) - func = tvm.build(sch, [a1, b1]) - assert func - a = numpy.array([114514]).astype("int32") - tvm_a = tvm.nd.array(a) - tvm_b = tvm.nd.array(numpy.array([0]).astype("int32")) - b = intrin_int(a) - func(tvm_a, tvm_b) - assert tvm_b.numpy()[0] == b[0] - - -@tvm.testing.skip_if_wheel_test -# test non caconical loops -def test_non_zero(): - @te.hybrid.script - def blur(a): - b = output_tensor((30, 30), "float32") - for i in range(2, 32): - for j in range(2, 32): - s = 0.0 - for di in range(3): - for dj in range(3): - s += a[i - di, j - dj] - b[i - 2, j - 2] = s / 9.0 - return b - - a = te.placeholder((32, 32), "float32", "a") - func, ins, outs = run_and_check(blur, [a]) - run_and_check(func, ins, outs=outs) - - @te.hybrid.script - def triangle(a, b): - c = output_tensor((10, 10), dtype="float32") - for i in range(10): - for j in range(i, 10): - c[i, j] = a[i] * b[j] - return c - - a = te.placeholder((10,), dtype="float32", name="a") - b = te.placeholder((10,), dtype="float32", name="b") - - func, ins, outs = run_and_check(triangle, [a, b]) - run_and_check(func, ins, outs=outs) - - -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_allocate(): - @te.hybrid.script - def blur2d(a): - b = output_tensor((30, 30), "float32") - for i in range(30): - ha = allocate((3, 30), "float32") - for j in range(3): - for k in range(30): - ha[j, k] = a[i + j, k] + a[i + j, k + 1] + a[i + j, k + 2] - for j in range(30): - b[i, j] = (ha[0, j] + ha[1, j] + ha[2, j]) / 9.0 - return b - - a = te.placeholder((32, 32), "float32", "a") - b = blur2d(a) - sch = te.create_schedule(b.op) - func, ins, outs = run_and_check(blur2d, [a]) - run_and_check(func, ins, outs=outs) - - @te.hybrid.script - def share_vec_add(a, b): - c = output_tensor((256,), "float32") - shared = allocate((256,), "float32", "shared") - for i in bind("threadIdx.x", 256): - shared[i] = a[i] - local = allocate((256,), "float32", "local") - for i in 
bind("threadIdx.x", 256): - local[i] = b[i] - for i in bind("threadIdx.x", 256): - c[i] = shared[i] + local[i] - return c - - a = te.placeholder((256,), dtype="float32", name="a") - b = te.placeholder((256,), dtype="float32", name="b") - c = share_vec_add(a, b) - func, ins, outs = run_and_check(share_vec_add, [a, b], target="cuda") - run_and_check(func, ins, outs=outs, target="cuda") - - -@tvm.testing.skip_if_wheel_test -def test_upstream(): - @te.hybrid.script - def upstream(a): - b = output_tensor((20,), "float32") - for i in range(20): - b[i] = a[i] * i - return b - - a = te.placeholder((20,), "float32") - b = te.placeholder((20,), "float32") - c = te.compute((20,), lambda x: a[x] + b[x]) - d = upstream(c) - sch = te.create_schedule([c.op, d.op]) - ir = tvm.lower(sch, [a, b, d]) - func = tvm.build(sch, [a, b, d]) - assert func - - a = numpy.random.randn(20).astype("float32") - b = numpy.random.randn(20).astype("float32") - ref = numpy.zeros((20,), "float32") - for i in range(20): - ref[i] = (a[i] + b[i]) * i - - tvm_a = tvm.nd.array(a) - tvm_b = tvm.nd.array(b) - tvm_d = tvm.nd.array(numpy.zeros((20,)).astype("float32")) - - func(tvm_a, tvm_b, tvm_d) - tvm.testing.assert_allclose(tvm_d.numpy(), ref, 1e-5, 1e-5) - - -@tvm.testing.skip_if_wheel_test -def test_downstream(): - @te.hybrid.script - def downstream(a): - b = output_tensor((20,), "float32") - for i in range(20): - b[i] = a[i] * i - return b - - a = te.placeholder((20,), "float32") - b = downstream(a) - c = te.compute((20,), lambda x: b[x] + 1.0) - - sch = te.create_schedule(c.op) - module = tvm.build(sch, [a, c]) - assert module - - a = numpy.random.randn(20).astype("float32") - ref = numpy.zeros((20,)).astype("float32") - for i in range(20): - ref[i] = (a[i] * i) + 1.0 - - tvm_a = tvm.nd.array(a) - tvm_c = tvm.nd.array(numpy.zeros((20,)).astype("float32")) - module(tvm_a, tvm_c) - tvm.testing.assert_allclose(tvm_c.numpy(), ref, 1e-5, 1e-5) - - -@tvm.testing.skip_if_wheel_test -def test_const_param(): - 
@te.hybrid.script - def add_something(a, b): - c = output_tensor((11,), "int32") - for i in range(11): - c[i] = a[i] + b - return c - - a = te.placeholder((11,), dtype="int32", name="a") - b = tvm.tir.const(11, "int32") - c = add_something(a, b) - sch = te.create_schedule(c.op) - module = tvm.build(sch, [a, c], "llvm") - assert module - - np_a = numpy.arange(11).astype("int32") - np_b = 11 - np_c = numpy.zeros((11,)).astype("int32") - - nd_a = tvm.nd.array(np_a) - nd_c = tvm.nd.array(numpy.zeros((11,)).astype("int32")) - module(nd_a, nd_c) - ref = add_something(np_a, 11) - - tvm.testing.assert_allclose(nd_c.numpy(), ref, 1e-5, 1e-5) - - -@tvm.testing.skip_if_wheel_test -def test_value_index(): - @te.hybrid.script - def kernel_a(a): - b = output_tensor((16,), "int32") - c = output_tensor((4, 4), "int32") - for i in range(16): - b[i] = a[i] + 2 - c[i // 4, i % 4] = a[i] + 1 - return b, c - - @te.hybrid.script - def kernel_b(b, a): - c = output_tensor((4, 4), "int32") - for i in range(4): - for j in range(4): - c[i, j] = a[i * 4 + j] * b[i, j] - return c - - a = te.placeholder((16,), "int32") - b, c = kernel_a(a) - d = kernel_b(c, b) - sch = te.create_schedule(d.op) - module = tvm.build(sch, [a, d]) - assert module - - np_a = numpy.arange(16).astype("int32") - np_b, np_c = kernel_a(np_a) - ref = kernel_b(np_c, np_b) - - res = tvm.nd.array(numpy.zeros((4, 4)).astype("int32")) - module(tvm.nd.array(np_a), res) - tvm.testing.assert_allclose(res.numpy(), ref) - - -@tvm.testing.skip_if_wheel_test -def test_func_call(): - @te.hybrid.script - def foo(a, b): - for i in range(len(a)): - a[i] = i + 1.0 - for i in range(len(a)): - b[i] = i + 1.0 - c = outer_product(10, 10, a, b) - d = output_tensor(c.shape, c.dtype) - for i in range(10): - for j in range(10): - d[i, j] = c[i, j] + i * j - return d - - a = te.placeholder((10,), name="a") - b = te.placeholder((10,), name="b") - func, ins, outs = run_and_check(foo, [a, b]) - run_and_check(func, ins, outs=outs) - - 
-@tvm.testing.skip_if_wheel_test -def test_bool(): - @te.hybrid.script - def foo(a): - b = output_tensor(a.shape, a.dtype) - b[0] = 1.2 - for i in range(1, a.shape[0] - 1): - if a[i] * a[i - 1] < a[i] or a[i] * a[i - 1] < a[i - 1] or i * a[i] == a[i]: - b[i] = a[i] - else: - b[i] = 0.0 - return b - - a = te.placeholder((10,), name="a") - func, ins, outs = run_and_check(foo, [a]) - run_and_check(func, ins, outs=outs) - - -@tvm.testing.skip_if_wheel_test -def test_const_range(): - @te.hybrid.script - def foo(a, b): - c = output_tensor(a.shape, a.dtype) - d = output_tensor(a.shape, "int32") - - for i in const_range(2): - for j in const_range(5): - c[i, j] = float32(int32(a[i, j]) + b[i, j]) - - for i in const_range(len(b)): - for j in const_range(len(b[0])): - d[i, j] = int32(a[i, j] + b[i, j]) - - return c, d - - a = te.placeholder((2, 5), name="a", dtype="float32") - b = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]] - func, ins, outs = run_and_check(foo, [a, b]) - run_and_check(func, ins, outs=outs) - - @te.hybrid.script - def goo(a, b): - c = output_tensor(a.shape, a.dtype) - len_b = len(b) - for i in const_range(len_b * 2): - if i < len_b: - c[i] = a[i] + b[i] - else: - c[i - len_b] = a[i - len_b] + b[i - len_b] - return c - - a = te.placeholder((5,), name="a", dtype="int32") - b = [1, 2, 3, 4, 5] - c = goo(a, tvm.runtime.convert(b)) - sch = te.create_schedule(c.op) - func, ins, outs = run_and_check(goo, [a, b]) - run_and_check(func, ins, outs=outs) - - @te.hybrid.script - def hoo(a, b): - c = output_tensor(a.shape, a.dtype) - len_b = len(b) - for i in range(a.shape[0]): - for j in const_range(len(b)): - d = a[i] * b[j] - d += a[i] + b[j] - c[i] = d - return c - - a = te.placeholder((5,), name="a", dtype="int32") - b = [1, 2, 3, 4, 5] - func, ins, outs = run_and_check(hoo, [a, b]) - run_and_check(func, ins, outs=outs) - - -@tvm.testing.skip_if_wheel_test -def test_schedule(): - @script - def outer_product(a, b): - c = output_tensor((64, 64), a.dtype) - for i in range(64): - 
for j in range(64): - c[i, j] = a[i] * b[j] - return c - - a = te.placeholder((64,), name="a", dtype="float32") - b = te.placeholder((64,), name="b", dtype="float32") - c = outer_product(a, b) - - # Test perfect loop split - # Test loop reorder - # Test loop annotation - sch = te.create_schedule(c.op) - i, j = c.op.axis - io, ii = sch[c].split(i, 4) - sch[c].parallel(ii) - jo, ji = sch[c].split(j, 4) - joo, joi = sch[c].split(jo, 4) - sch[c].vectorize(ji) - sch[c].reorder(ii, io, joo, joi, ji) - ir = tvm.lower(sch, [a, b, c])["main"].body - assert isinstance(ir, tvm.tir.AttrStmt) - ir = ir.body - assert isinstance(ir, tvm.tir.For) - assert ir.loop_var.name == "i.inner" - ir = ir.body - assert isinstance(ir, tvm.tir.For) - assert ir.loop_var.name == "i.outer" - ir = ir.body - assert isinstance(ir, tvm.tir.For) - assert ir.loop_var.name == "j.outer.outer" - ir = ir.body - assert isinstance(ir, tvm.tir.For) - assert ir.loop_var.name == "j.outer.inner" - ir = ir.body - func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c]) - run_and_check(func, ins, outs=outs) - - # Test fuse - sch = te.create_schedule(c.op) - sch[c].fuse(c.op.axis[0], c.op.axis[1]) - ir = tvm.lower(sch, [a, b, c])["main"].body - assert isinstance(ir, tvm.tir.AttrStmt) - ir = ir.body - assert isinstance(ir, tvm.tir.For) - assert ir.loop_var.name == "i.j.fused" - func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c]) - run_and_check(func, ins, outs=outs) - - # Test imperfect loop split - sch = te.create_schedule(c.op) - sch[c].split(c.op.axis[0], 3) - ir = tvm.lower(sch, [a, b, c], simple_mode=True) - func, ins, outs = run_and_check(outer_product, [a, b], sch=sch, outs=[c]) - run_and_check(func, ins, outs=outs) - - # Test loop binds - - -@tvm.testing.skip_if_wheel_test -def test_capture(): - n = 8 - - constant_tuple = (10, n) - constant_list = [[1, 2], [3, n]] - const_value = 1 - - @te.hybrid.script - def add_something(a): - c = output_tensor((constant_tuple[1],), 
"int32") - for i in range(constant_tuple[1]): - c[i] = a[i] + constant_list[1][const_value] - return c - - a = te.placeholder((n,), dtype="int32", name="a") - - func, ins, outs = run_and_check(add_something, [a]) - run_and_check(func, ins, outs=outs) - - -@tvm.testing.skip_if_wheel_test -def test_array_inputs(): - @script - def sum_array(inputs): - out = output_tensor((10,), inputs[0].dtype) - n = len(inputs) - for i in range(10): - for j in const_range(n): - out[i] += inputs[j][i] - return out - - n = 5 - inputs = [] - for i in range(n): - inputs.append(te.placeholder((10,), name="t%s" % i, dtype="float32")) - - out = sum_array(tvm.runtime.convert(inputs)) - assert len(out.op.inputs) == n - - sch = te.create_schedule(out.op) - mod = tvm.build(sch, inputs + [out], target="llvm") - assert mod - - input_nd = [] - out_ref = numpy.zeros((10,)) - for _ in range(n): - arr = numpy.random.uniform(size=(10,)).astype("float32") - input_nd.append(tvm.nd.array(arr)) - out_ref += arr - out_nd = tvm.nd.array(numpy.zeros((10,), "float32")) - mod(*input_nd, out_nd) - tvm.testing.assert_allclose(out_nd.numpy(), out_ref) - - -if __name__ == "__main__": - test_outer_product() - test_fanout() - test_looptype() - test_if() - test_bind() - test_math_intrin() - test_non_zero() - test_allocate() - test_upstream() - test_downstream() - test_const_param() - test_value_index() - test_func_call() - test_bool() - test_const_range() - test_schedule() - test_capture() - test_array_inputs() - # TODO: - # test_inplace() diff --git a/tests/python/te/test_te_schedule.py b/tests/python/te/test_te_schedule.py deleted file mode 100644 index d46db2b702c0..000000000000 --- a/tests/python/te/test_te_schedule.py +++ /dev/null @@ -1,382 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import pickle as pkl - -import pytest -import tvm -from tvm import te -from tvm.driver.build_module import schedule_to_module - - -def test_schedule_create(): - m = te.size_var("m") - n = te.size_var("n") - l = te.size_var("l") - A = te.placeholder((m, l), name="A") - B = te.placeholder((n, l), name="B") - AA = te.compute((m, l), lambda i, j: A[i, j]) - T = te.compute((m, n, l), lambda i, j, k: AA(i, k) * B(j, k)) - s = te.create_schedule(T.op) - s[AA].set_scope("shared") - xo, xi = s[T].split(T.op.axis[0], factor=10) - xi1, xi2 = s[T].split(xi, factor=2) - s[AA].compute_at(s[T], xi1) - xo, xi = s[AA].split(AA.op.axis[0], factor=10) - s[T].reorder(xi2, xi1) - assert T.op.axis[1] in s[T].leaf_iter_vars - - # save load json - json_str = tvm.ir.save_json(s) - s_loaded = tvm.ir.load_json(json_str) - assert isinstance(s_loaded, tvm.te.schedule.Schedule) - assert str(s_loaded.outputs[0].body) == str(s.outputs[0].body) - - # pickle unpickle - dump = pkl.dumps(s) - s_loaded = pkl.loads(dump) - assert isinstance(s_loaded, tvm.te.schedule.Schedule) - assert str(s_loaded.outputs[0].body) == str(s.outputs[0].body) - - -def test_reorder(): - m = te.size_var("m") - A = te.placeholder((m,), name="A") - T = te.compute(m, lambda i: A[i + 1]) - - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=10) - xi1, xi2 = s[T].split(xi, factor=2) - order = (xi2, xi1, xo) - assert 
tuple(s[T].leaf_iter_vars) != order - s[T].reorder(*order) - assert tuple(s[T].leaf_iter_vars) == order - try: - # pass duplicate IterVar - # must raise an error - s[T].reorder(xi2, xi1, xi2) - assert False - except tvm.error.TVMError: - pass - - -def test_split(): - m = te.size_var("m") - A = te.placeholder((m,), name="A") - T = te.compute((m,), lambda i: A[i]) - - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=10) - assert tuple(s[T].leaf_iter_vars) == (xo, xi) - - -def test_tile(): - m = te.size_var("m") - n = te.size_var("n") - A = te.placeholder((m, n), name="A") - T = te.compute((m, n), lambda i, j: A[i, j]) - - s = te.create_schedule(T.op) - xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5) - assert tuple(s[T].leaf_iter_vars) == (xo, yo, xi, yi) - - -def test_fuse(): - m = te.size_var("m") - n = te.size_var("n") - A = te.placeholder((m, n), name="A") - T = te.compute((m, n), lambda i, j: A[i, j]) - - s = te.create_schedule(T.op) - xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5) - fused = s[T].fuse(xo, yo) - assert any(isinstance(x, tvm.te.schedule.Fuse) for x in s[T].relations) - assert tuple(s[T].leaf_iter_vars) == (fused, xi, yi) - - -def test_fuse_with_split(): - m = te.size_var("m") - n = te.size_var("n") - A = te.placeholder((m, n), name="A") - T = te.compute((m, n), lambda i, j: A[i, j]) - - s = te.create_schedule(T.op) - y = T.op.axis[1] - xo, xi = s[T].split(T.op.axis[0], factor=10) - fused = s[T].fuse(xi, y) - assert any(isinstance(x, tvm.te.schedule.Fuse) for x in s[T].relations) - assert tuple(s[T].leaf_iter_vars) == (xo, fused) - - -def test_fuse_with_out_of_order_axis(): - m = te.size_var("m") - n = te.size_var("n") - A = te.placeholder((m, n), name="A") - T = te.compute((m, n), lambda i, j: A[i, j]) - - s = te.create_schedule(T.op) - y = T.op.axis[1] - xo, xi = s[T].split(T.op.axis[0], factor=10) - - with pytest.raises(RuntimeError): - fused = s[T].fuse(xo, 
y) # should throw here - - -def test_fuse_with_out_of_order_axis_with_reorder(): - m = te.size_var("m") - n = te.size_var("n") - A = te.placeholder((m, n), name="A") - T = te.compute((m, n), lambda i, j: A[i, j]) - - s = te.create_schedule(T.op) - y = T.op.axis[1] - xo, xi = s[T].split(T.op.axis[0], factor=10) - s[T].reorder(y, xo, xi) - fused = s[T].fuse(y, xo) # should be ok - - s = te.create_schedule(T.op) - y = T.op.axis[1] - xo, xi = s[T].split(T.op.axis[0], factor=10) - s[T].reorder(y, xo, xi) - - with pytest.raises(RuntimeError): - fused = s[T].fuse(y, xi) # should throw here - - -def test_singleton(): - A = te.placeholder((), name="A") - T = te.compute((), lambda: A() + 1) - s = te.create_schedule(T.op) - fused = s[T].fuse() - assert any(isinstance(x, tvm.te.schedule.Singleton) for x in s[T].relations) - assert tuple(s[T].leaf_iter_vars) == (fused,) - dump = pkl.dumps(s) - s_loaded = pkl.loads(dump) - assert isinstance(s_loaded, tvm.te.schedule.Schedule) - - -def test_vectorize(): - m = te.size_var("m") - n = te.size_var("n") - A = te.placeholder((m, n), name="A") - T = te.compute((m, n), lambda i, j: A[i, j]) - - s = te.create_schedule(T.op) - xo, yo, xi, yi = s[T].tile(T.op.axis[0], T.op.axis[1], x_factor=10, y_factor=5) - s[T].vectorize(yi) - s[T].unroll(xi) - UNROLL = tvm.te.schedule.IterVar.Unrolled - VECTORIZE = tvm.te.schedule.IterVar.Vectorized - assert s[T].iter_var_attrs[xi].iter_type == UNROLL - assert s[T].iter_var_attrs[yi].iter_type == VECTORIZE - - -def test_vectorize_commreduce(): - V = te.placeholder((128,), name="V") - ax = te.reduce_axis((0, 128), name="ax") - O = te.compute((1,), lambda _: te.sum(V[ax], axis=[ax])) - s = te.create_schedule(O.op) - with pytest.raises(RuntimeError): - s[O].vectorize(ax) # should throw here - - -def test_pragma(): - m = 100 - A = te.placeholder((m,), name="A") - T = te.compute((m,), lambda i: A[i]) - - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=10) - s[T].pragma(xo, "pragma1") - 
s[T].pragma(xi, "vectorize") - VECTORIZE = tvm.te.schedule.IterVar.Vectorized - assert s[T].iter_var_attrs[xo].pragma_keys[0].value == "pragma1" - assert s[T].iter_var_attrs[xi].iter_type == VECTORIZE - - -def test_rfactor(): - n = te.size_var("n") - k1 = te.reduce_axis((0, n), name="k1") - k2 = te.reduce_axis((0, n), name="k2") - A = te.placeholder((n, n, n), name="A") - B = te.compute((n,), lambda i: te.sum(A[i, k1, k2], axis=[k1, k2])) - # normal schedule - s = te.create_schedule(B.op) - BF = s.rfactor(B, k1) - assert tuple(BF.shape) == (n, n) - assert set(BF.op.body[0].axis) == set([k2]) - assert s[B].op.body[0].axis[0].dom.extent == n - assert len(s[B].all_iter_vars) == 2 - # schedule with split - s = te.create_schedule(B.op) - ko, ki = s[B].split(k1, factor=4) - xo, xi = s[B].split(B.op.axis[0], factor=8) - BF = s.rfactor(B, ki) - assert BF.shape[0].value == 4 - assert BF.shape[1] == n - assert BF.op.body[0].axis[0] == k2 - assert BF.op.body[0].axis[1].var == ko.var - assert s[B].op.body[0].axis[0].dom.extent.value == 4 - # schedule with factor_axis - s = te.create_schedule(B.op) - ko, ki = s[B].split(k1, factor=4) - xo, xi = s[B].split(B.op.axis[0], factor=8) - BF = s.rfactor(B, ki, 1) - assert n == BF.shape[0] - assert BF.shape[1].value == 4 - assert BF.op.body[0].axis[0] == k2 - assert BF.op.body[0].axis[1].var == ko.var - assert s[B].op.body[0].axis[0].dom.extent.value == 4 - - -def test_tensor_intrin(): - n = 16 - x = te.placeholder((n,), name="x") - y = te.placeholder((n,), name="y") - z = te.compute(x.shape, lambda i: x[i] + y[i], name="z") - - def intrin_func(ins, outs): - assert isinstance(ins[0], tvm.te.schedule.Buffer) - assert ins[0].shape[0].value == n - return tvm.tir.call_packed("vadd", ins[0].data, outs[0].data, ins[0].shape[0]) - - intrin = te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params={"offset_factor": n}) - assert intrin.op == z.op - assert intrin.reduce_init is None - assert tuple(intrin.inputs) == 
tuple(z.op.input_tensors) - assert intrin.buffers[0].shape[0].value == n - m = 32 - X = te.placeholder((m,), name="X") - Y = te.placeholder((m,), name="Y") - Z = te.compute(X.shape, lambda i: X[i] + Y[i], name="Z") - s = te.create_schedule(Z.op) - xo, xi = s[Z].split(Z.op.axis[0], factor=n) - s[Z].tensorize(xi, intrin) - stmt = tvm.lower(s, [X, Y, Z])["main"].body - assert isinstance(stmt.body, tvm.tir.Evaluate) - assert str(stmt.body.value.args[0]) == '"vadd"' - assert str(stmt.body.value.args[1]) == "X" - assert str(stmt.body.value.args[2]) == "Z" - assert s[Z].iter_var_attrs[xi].tensor_intrin == intrin - assert s[Z].iter_var_attrs[xi].iter_type == tvm.te.schedule.IterVar.Tensorized - - -def test_tensor_intrin_scalar_params(): - n = te.size_var("n") - x = te.placeholder((n,), name="x") - v = te.size_var("v") - w = te.size_var("w") - z = te.compute((n,), lambda i: x[i] * v + w, name="z") - - def intrin_func(ins, outs, sp): - assert isinstance(ins[0], tvm.te.schedule.Buffer) - assert ins[0].shape[0] == n - assert sp[0] == v - assert sp[1] == w - return tvm.tir.call_packed("hw_func", ins[0].data, outs[0].data, sp[0], sp[1]) - - intrin = te.decl_tensor_intrin( - z.op, intrin_func, scalar_params=[v, w], default_buffer_params={"offset_factor": 1} - ) - assert intrin.op == z.op - assert intrin.reduce_init is None - assert tuple(intrin.inputs) == tuple(z.op.input_tensors) - assert intrin.buffers[0].shape[0] == n - assert tuple(intrin.scalar_params) == tuple((v, w)) - - A = te.placeholder((10, 10), name="A") - # Pass scalar inputs to the TensorIntrin, interleaved with tensor inputs - C = te.compute((10, 10), lambda i, j: intrin(i * i, A[i, j], i + j), name="C") - s = te.create_schedule(C.op) - stmt = tvm.lower(s, [A, C])["main"].body - assert isinstance(stmt.body.body, tvm.tir.Evaluate) - assert len(stmt.body.body.value.args) == 5 - assert str(stmt.body.body.value.args[3]) == "i * i" - assert str(stmt.body.body.value.args[4]) == "i + j" - - -def 
test_legalize_invalid_attach(): - A = te.compute((10, 10), lambda i, j: 1.0, name="A") - B = te.compute((10, 10), lambda i, j: A[i][j], name="B") - - # Case 1: Split an axis which is the target of a compute_at - s = te.create_schedule([B.op]) - s[A].compute_at(s[B], B.op.axis[1]) - s[B].split(B.op.axis[1], 2) - - stmt = tvm.lower(s, [A, B], simple_mode=True)["main"].body - assert isinstance(stmt.body.body, tvm.tir.stmt.For) - - # Case 2: Fuse an axis which is the target of a compute_at - s = te.create_schedule([B.op]) - s[A].compute_at(s[B], B.op.axis[1]) - s[B].fuse(B.op.axis[0], B.op.axis[1]) - stmt = tvm.lower(s, [A, B], simple_mode=True)["main"].body - assert isinstance(stmt, tvm.tir.stmt.For) - - -def test_compute_at(): - def add(): - shape = (16, 16) - A = tvm.te.compute(shape, lambda *i: 1.0, name="A") - B = tvm.te.compute(shape, lambda *i: 2.0, name="B") - C = tvm.te.compute(shape, lambda *i: A(*i) + B(*i), name="C") - return A, B, C - - def invalid_compute_at_self(): - A, B, C = add() - s = tvm.te.create_schedule(C.op) - s[C].compute_at(s[C], C.op.axis[0]) - with pytest.raises(RuntimeError): - tvm.lower(s, [A, B], simple_mode=True) - - def invalid_compute_at_loop(): - A, B, C = add() - s = tvm.te.create_schedule(C.op) - s[A].compute_at(s[C], C.op.axis[0]) - s[C].compute_at(s[A], A.op.axis[0]) - with pytest.raises(RuntimeError): - tvm.lower(s, [C], simple_mode=True) - - invalid_compute_at_self() - invalid_compute_at_loop() - - -@pytest.mark.parametrize("split_factor", [4, 4 * tvm.tir.vscale()]) -@pytest.mark.parametrize("disable_predication", [True, False]) -def test_split_disable_predicate(split_factor, disable_predication): - A = te.placeholder((43,), name="A") - B = te.compute(A.shape, lambda i: A[i] + 2, name="C") - - sch = te.create_schedule(B.op) - (i,) = sch[B].op.axis - _, _ = sch[B].split(i, factor=split_factor, disable_predication=disable_predication) - - mod = schedule_to_module(sch, [A, B], "main") - - predicates = [] - - def 
_find_predicates(stmt): - if isinstance(stmt, tvm.tir.stmt.IfThenElse): - predicates.append(stmt) - - tvm.tir.stmt_functor.post_order_visit(mod["main"].body, _find_predicates) - - assert bool(len(predicates)) != disable_predication - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/te/test_te_schedule_bound_inference.py b/tests/python/te/test_te_schedule_bound_inference.py deleted file mode 100644 index c246ee9f4109..000000000000 --- a/tests/python/te/test_te_schedule_bound_inference.py +++ /dev/null @@ -1,512 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import tvm -import tvm.testing -from tvm import te - - -def test_bound1(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - - s = te.create_schedule([A2.op]) - xo, xi = s[A2].split(s[A2].op.axis[0], 8) - s[A1].compute_at(s[A2], xo) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - assert bounds[A1.op.axis[0]].extent.value == 8 - - -def test_bound2(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - s = te.create_schedule(A2.op) - xo, yo, xi, yi = s[A2].tile(A2.op.axis[0], A2.op.axis[1], 8, 8) - # test normalize not affecting schedule - _ = s.normalize() - s[A1].compute_at(s[A2], yo) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - assert bounds[A1.op.axis[0]].extent.value == 8 - assert bounds[A1.op.axis[1]].extent.value == 8 - - -def test_bound3(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - - s = te.create_schedule(A2.op) - s[A1].set_scope("shared") - xo, xi = s[A2].split(A2.op.axis[0], 32) - xi0, xi1 = s[A2].split(xi, nparts=16) - s[A2].bind(xi0, te.thread_axis("threadIdx.x")) - yo, yi = s[A2].split(A2.op.axis[1], 16) - # test normalize not affecting schedule - _ = s.normalize() - s[A2].reorder(xo, xi0, yo, xi1, yi) - s[A1].compute_at(s[A2], yo) - - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - assert bounds[A1.op.axis[0]].extent.value == 32 - assert bounds[A1.op.axis[1]].extent.value == 16 - - -def test_bound_split_ext_less_than_factor(): - m = 8 - I = te.placeholder((m,), name="I") - EF = 
te.compute((m,), lambda i: I[i] * 2, name="EF") - E = te.compute((m,), lambda i: EF[i] * 2, name="E") - s = te.create_schedule([E.op]) - xo, xi = s[E].split(s[E].op.axis[0], factor=32) - s[EF].compute_at(s[E], xo) - - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - assert bounds[xi].extent.value == m - - -def test_bound_split_ext_less_than_naprts(): - m = 8 - I = te.placeholder((m,), name="I") - EF = te.compute((m,), lambda i: I[i] * 2, name="EF") - E = te.compute((m,), lambda i: EF[i] * 2, name="E") - s = te.create_schedule([E.op]) - xo, xi = s[E].split(s[E].op.axis[0], nparts=32) - s[EF].compute_at(s[E], xo) - - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - assert bounds[xo].extent.value == m - - -def test_bound_split_divisible(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((8 * m, l), name="A") - B = te.compute((8 * m, l), lambda i, j: A[i, j], name="B") - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], 8) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - assert bounds[xo].extent == m - assert bounds[xi].extent.value == 8 - - -def test_bound_tile_divisible(): - m = te.var("m") - l = te.var("l") - shape = (8 * m, 32 * l) - A = te.placeholder(shape, name="A") - B = te.compute(shape, lambda i, j: A[i, j], name="B") - s = te.create_schedule(B.op) - xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], 8, 32) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - assert bounds[xo].extent == m - assert bounds[xi].extent.value == 8 - assert bounds[yo].extent == l - assert bounds[yi].extent.value == 32 - - -def test_bound_fusesplit1(): - m = te.var("m") - l = te.var("l") - split1 = te.var("s") - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - - s = te.create_schedule(A2.op) 
- fused_axes = s[A2].fuse(A2.op.axis[0], A2.op.axis[1]) - xo, xi = s[A2].split(fused_axes, split1) - s[A1].compute_at(s[A2], xo) - - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - idxdiv = tvm.tir.indexdiv - tvm.testing.assert_prim_expr_equal(bounds[A1.op.axis[0]].min, idxdiv(xo * split1, l)) - - expected_extent = idxdiv((xo + 1) * split1 - 1, l) - idxdiv(xo * split1, l) + 1 - for i in range(1, 6): - for j in range(1, 6): - for k in range(1, 6): - vars = tvm.runtime.convert( - { - split1: tvm.tir.const(i, "int32"), - l: tvm.tir.const(j, "int32"), - xo.var: tvm.tir.const(k, "int32"), - } - ) - tvm.testing.assert_prim_expr_equal( - tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[0]].extent, vars), - tvm.tir.stmt_functor.substitute(expected_extent, vars), - ) - - tvm.testing.assert_prim_expr_equal(bounds[A1.op.axis[1]].extent, l) - - -def test_bound_fusesplit2(): - m = te.var("m") - l = tvm.runtime.convert(6) - split = tvm.runtime.convert(3) - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - - s = te.create_schedule(A2.op) - fused_axes = s[A2].fuse(A2.op.axis[0], A2.op.axis[1]) - xo, xi = s[A2].split(fused_axes, split) - s[A1].compute_at(s[A2], xo) - - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - vars = tvm.runtime.convert({xo.var: tvm.tir.const(5, "int32")}) - tvm.testing.assert_prim_expr_equal( - tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[0]].min, vars), 2 - ) - tvm.testing.assert_prim_expr_equal( - tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[1]].min, vars), 3 - ) - tvm.testing.assert_prim_expr_equal( - tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[0]].extent, vars), 1 - ) - tvm.testing.assert_prim_expr_equal( - tvm.tir.stmt_functor.substitute(bounds[A1.op.axis[1]].extent, vars), 3 - ) - - -def test_bound_warp(): - m = te.var("m") - l = te.var("l") - A = 
te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - - s = te.create_schedule(A2.op) - s[A1].set_scope("warp") - xo, xi = s[A2].split(A2.op.axis[0], 32) - xi0, xi1 = s[A2].split(xi, factor=16) - tx = te.thread_axis("threadIdx.x") - s[A2].bind(xi1, tx) - s[A2].bind(xi0, te.thread_axis("threadIdx.y")) - y = s[A2].op.axis[1] - s[A1].compute_at(s[A2], y) - xo, xi = s[A1].split(s[A1].op.axis[0], factor=16) - s[A1].bind(xi, tx) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - assert bounds[A1.op.axis[0]].extent.value == 16 - - -def test_bound_scan(): - m = te.var("m") - n = te.var("n") - X = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - s_state = te.placeholder((m, n)) - s_init = te.compute((1, n), lambda _, i: X[0, i]) - s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i]) - s_scan = tvm.te.scan(s_init, s_update, s_state) - - assert tuple(s_scan.shape) == (m, n) - s = te.create_schedule(s_scan.op) - XX = s.cache_read(X, "local", s_update) - xo, xi = s[s_update].split(s_update.op.axis[1], factor=4) - s[XX].compute_at(s[s_update], xo) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - assert bounds[XX.op.axis[1]].extent.value == 4 - - -def test_bound_conv1d(): - n = te.var("n") - A = te.compute((n + 2), lambda i: 1, name="A") - - def computeB(ii): - i = ii + 1 - return A[i - 1] + A[i] + A[i + 1] - - B = te.compute(n, computeB, name="B") - s = te.create_schedule(B.op) - s[A].compute_at(s[B], B.op.axis[0]) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - assert bounds[A.op.axis[0]].extent.value == 3 - - -def test_bound_blur(): - n = tvm.runtime.convert(12) - A = te.compute((n, n), lambda i, j: 1, name="A") - - def computeB(ii, jj): - # set the correct center - i = ii + 1 - j = jj + 1 - return A[i][j] + A[i - 
1][j] + A[i + 1][j] + A[i][j + 1] + A[i][j - 1] - - B = te.compute((n - 2, n - 2), computeB, name="B") - s = te.create_schedule(B.op) - s[A].compute_at(s[B], B.op.axis[1]) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - assert bounds[A.op.axis[0]].extent.value == 3 - assert bounds[A.op.axis[1]].extent.value == 3 - - -def test_bound_rfactor(): - n = te.var("n") - A = te.placeholder((n,), name="A") - k = te.reduce_axis((0, n)) - B = te.compute((1,), lambda i: te.sum(A[k], axis=k, where=(i > 1)), name="B") - # schedule - s = te.create_schedule(B.op) - kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - - assert bounds[BF.op.axis[0]].extent.value == 4 - assert bounds[BF.op.axis[1]].extent.value == 1 - - -def test_bound_group_schedule(): - m = te.var("m") - n = te.var("n") - x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1") - x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2") - s = te.create_schedule(x2.op) - g = s.create_group(outputs=x1, inputs=x, include_inputs=True) - g.compute_at(s[x2], x2.op.axis[0]) - assert s[x1].group == g - assert s[x].group == g - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - assert bounds[x.op.axis[0]].extent.value == 1 - assert bounds[x.op.axis[1]].extent == n - - -def test_bound_nest_group(): - m = te.var("m") - n = te.var("n") - x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - x1 = te.compute(x.shape, lambda *i: x(*i) + 1, name="x1") - x2 = te.compute(x.shape, lambda *i: x1(*i) + 2, name="x2") - s = te.create_schedule(x2.op) - g1 = s.create_group(outputs=x, inputs=x, include_inputs=True) - g2 = s.create_group(outputs=x1, inputs=x, include_inputs=True) - assert s[x].group == g1 - assert s[x1].group == g2 - g2.compute_at(s[x2], x2.op.axis[0]) - g1.compute_at(s[x1], s[x1].op.axis[1]) - s = s.normalize() - bounds = 
tvm.te.schedule.InferBound(s) - assert bounds[x.op.axis[0]].extent.value == 1 - assert bounds[x.op.axis[1]].extent.value == 1 - assert bounds[x1.op.axis[0]].extent.value == 1 - assert bounds[x1.op.axis[1]].extent == n - - -def test_bound_nest_thread(): - m = te.var("m") - A = te.placeholder((m), name="A") - A1 = te.compute((m,), lambda i: A[i], name="A1") - A2 = te.compute((m,), lambda i: A1[i] + 2, name="A2") - A3 = te.compute((m,), lambda i: A2[i] + 3, name="A3") - - s = te.create_schedule(A3.op) - s[A2].set_scope("shared") - s[A1].set_scope("local") - - block_x = te.thread_axis("blockIdx.x") - thread_x = te.thread_axis("threadIdx.x") - bx, tx = s[A3].split(A3.op.axis[0], factor=32) - s[A3].bind(bx, block_x) - s[A3].bind(tx, thread_x) - s[A2].compute_at(s[A3], tx) - _, xi = s[A2].split(A2.op.axis[0], nparts=1) - s[A2].bind(xi, thread_x) - s[A1].compute_at(s[A3], tx) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - assert bounds[A1.op.axis[0]].extent.value == 1 - assert bounds[A2.op.axis[0]].extent.value == 32 - assert bounds[A3.op.axis[0]].extent == m - - -def test_gemm_bound(): - nn = 1024 - n = tvm.runtime.convert(nn) - A = te.placeholder((n, n), name="A") - B = te.placeholder((n, n), name="B") - k = te.reduce_axis((0, n), name="k") - C = te.compute((n, n), lambda ii, jj: te.sum(A[ii, k] * B[jj, k], axis=k), name="CC") - # schedule - s = te.create_schedule(C.op) - xtile, ytile = 32, 32 - scale = 8 - num_thread = 8 - block_factor = scale * num_thread - block_x = te.thread_axis("blockIdx.x") - thread_x = te.thread_axis("threadIdx.x") - block_y = te.thread_axis("blockIdx.y") - thread_y = te.thread_axis("threadIdx.y") - - CC = s.cache_write(C, "local") - AA = s.cache_read(A, "shared", [CC]) - BB = s.cache_read(B, "shared", [CC]) - by, yi = s[C].split(C.op.axis[0], factor=block_factor) - bx, xi = s[C].split(C.op.axis[1], factor=block_factor) - s[C].reorder(by, bx, yi, xi) - s[C].bind(by, block_y) - s[C].bind(bx, block_x) - ty, yi = s[C].split(yi, 
nparts=num_thread) - tx, xi = s[C].split(xi, nparts=num_thread) - s[C].reorder(ty, tx, yi, xi) - s[C].bind(ty, thread_y) - s[C].bind(tx, thread_x) - yo, xo = CC.op.axis - s[CC].reorder(k, yo, xo) - - s[CC].compute_at(s[C], tx) - s[AA].compute_at(s[CC], k) - s[BB].compute_at(s[CC], k) - - ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread) - tx, xi = s[AA].split(xi, nparts=num_thread) - s[AA].bind(ty, thread_y) - s[AA].bind(tx, thread_x) - - ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread) - tx, xi = s[BB].split(xi, nparts=num_thread) - s[BB].bind(ty, thread_y) - s[BB].bind(tx, thread_x) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - assert bounds[BB.op.axis[0]].extent.value == 64 - assert bounds[AA.op.axis[0]].extent.value == 64 - assert bounds[CC.op.axis[0]].extent.value == 8 - assert bounds[CC.op.axis[1]].extent.value == 8 - - -def test_bound_tensor_compute_op(): - def intrin_test(): - m1 = te.var("m1") - n1 = te.var("n1") - a = te.placeholder((m1, n1), name="a") - c = te.compute((1, n1), lambda i, j: a[0, j] + a[1, j] + a[2, j], name="c") - - Ab = tvm.tir.decl_buffer(a.shape, name="Abuf", offset_factor=1) - Cb = tvm.tir.decl_buffer(c.shape, name="Cbuf", offset_factor=1) - - def intrin_func(ins, outs): - aa = ins[0] - cc = outs[0] - - def _body(): - ib = tvm.tir.ir_builder.create() - ib.emit( - tvm.tir.call_extern("int32", "test", cc.access_ptr("w"), aa.access_ptr("r")) - ) - return ib.get() - - return _body() - - return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, c: Cb}) - - test_func = intrin_test() - A = te.placeholder((20, 20), name="A") - B = te.compute(A.shape, lambda i, j: A[i, j], name="B") - C = te.compute((10, 20), lambda i: test_func(B[i:10, 0:20]), name="C") - s = te.create_schedule(C.op) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - assert bounds[B.op.axis[0]].extent.value == 10 - - -def test_bound_simplification_failure(): - # Check that the bounds are not expanded 
- A = te.compute((2,), lambda j: j, "A") - - def _check(B, A=A): - s = te.create_schedule(B.op) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.lower(s, [B, A], simple_mode=True) - if not bounds[A.op.axis[0]].extent.value <= 2: - print(stmt) - assert bounds[A.op.axis[0]].extent.value <= 2 - - tdiv = tvm.tir.truncdiv - # These are hard to simplify, moreover we don't simplify them - _check(te.compute((10,), lambda i: A[tvm.te.min(3 * i, 4 * i) + tvm.te.min(-3 * i, -2 * i)])) - _check(te.compute((10,), lambda i: A[tvm.te.min(3 * i, 4 * i) + tvm.te.max(-3 * i, -4 * i)])) - _check(te.compute((10,), lambda i: A[-2 * tdiv(i, 2) - tvm.te.min(i, 0 - i)])) - _check(te.compute((10,), lambda i: A[i + (0 - i)])) - # This would cause out of bounds, but we nevertheless include it - _check(te.compute((10,), lambda i: A[i])) - - -def test_bound_block(): - def _check(shape, expected, block_size=4): - N, C, H, W = shape - tail = C % block_size - chunks = C // block_size - if tail != 0: - chunks += 1 - A = te.placeholder((N, C, H, W), name="A") - pad_value = tvm.tir.const(0, A.dtype) - - def _reorder_data_nchw(*indices): - condition = [] - condition.append(indices[1] == chunks - 1) - condition.append(indices[4] >= tail) - condition = tvm.tir.all(*condition) - return tvm.tir.if_then_else( - condition, - pad_value, - A[indices[0], indices[1] * block_size + indices[4], indices[2], indices[3]], - ) - - repack = te.compute((N, chunks, H, W, block_size), _reorder_data_nchw, name="repack") - B = te.compute( - (N, C, H, W), - lambda n, c, h, w: repack[n, c // block_size, h, w, c % block_size], - name="back_repack", - ) - s = te.create_schedule([B.op]) - bounds = tvm.te.schedule.InferBound(s) - # Block for intermediate compute function should be equal to 4 for all cases except than number of channels is less than 4 - assert bounds[repack.op.axis[4]].extent.value == expected - - _check((1, 4, 6, 6), 4) - _check((1, 7, 6, 6), 4) - _check((1, 3, 6, 6), 3) - - -if 
__name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/te/test_te_schedule_bound_inference_tiling.py b/tests/python/te/test_te_schedule_bound_inference_tiling.py deleted file mode 100644 index 039fe08cd328..000000000000 --- a/tests/python/te/test_te_schedule_bound_inference_tiling.py +++ /dev/null @@ -1,62 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import tvm -from tvm import te - - -def test_bound_tile_mod(): - def compute(M_tiles, N_tiles, factor, dtype): - # Algo - M = M_tiles * factor - N = N_tiles * factor - - A = tvm.te.placeholder((N, M), name="A", dtype=dtype) - C = tvm.te.compute((N, M), lambda n, m: A[n, m], name="C") - s = tvm.te.create_schedule(C.op) - - return s, A, C - - def schedule(s, factor, padding, A, C): - C_local = s.cache_write(C, "local") - - n, m = C.op.axis - bn, bm, ni, mi = s[C].tile(n, m, factor, factor) - nio, nii = s[C].split(ni, 2) - n = s[C].fuse(nii, mi) - C_shared = s.cache_write(C, "shared") - bn, bm, ni, mi = C_shared.op.axis - s[C_shared].storage_align(ni, factor * 2, padding) - - n, m = s[C].op.axis - bn, bm, ni, mi = s[C].tile(n, m, factor, factor) - s[C].set_scope("global") - niio, niii = s[C].split(ni, 32) - s[C_shared].compute_at(s[C], niio) - - return s - - s, A, C = compute(2, 2, 128, "float16") - s = schedule(s, 128, 8, A, C) - bounds = tvm.te.schedule.InferBound(s) - check = bounds[s.stages[2].op.axis[2]].extent == 16 - if not check: - print(tvm.lower(s, [A, C], simple_mode=True)) - assert check - - -if __name__ == "__main__": - test_bound_tile_mod() diff --git a/tests/python/te/test_te_schedule_graph.py b/tests/python/te/test_te_schedule_graph.py deleted file mode 100644 index 05ca9fdbf8a8..000000000000 --- a/tests/python/te/test_te_schedule_graph.py +++ /dev/null @@ -1,142 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import tvm -from tvm import te - - -def test_scan(): - m = te.var("m") - n = te.var("n") - x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - s_state = te.placeholder((m, n)) - s_init = te.compute((1, n), lambda _, i: x[0, i], name="s_init") - x_trans = te.compute((m, n), lambda i, j: x[i, j] + 1, name="x_trans") - s_up1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] + 1, name="up1") - s_update = te.compute((m, n), lambda t, i: s_up1[t, i] + x_trans[t, i], name="update") - s_scan = tvm.te.scan(s_init, s_update, s_state) - - def test_getbody(): - body = tvm.te.schedule.ScanGetBody(s_scan.op) - assert set(body) == set([s_scan.op, s_update.op, s_up1.op]) - - def test_attach_path(): - s = te.create_schedule(s_scan.op) - s[x_trans].compute_at(s[s_update], s_update.op.axis[0]) - apath = tvm.te.schedule.CreateAttachPath(s) - assert tuple(apath[s_update.op]) == tuple([s_scan.op.scan_axis]) - assert tuple(apath[x_trans.op]) == tuple([s_update.op.axis[0], s_scan.op.scan_axis]) - - def test_fix_pt(): - body = tvm.te.schedule.ScanGetBody(s_scan.op) - fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op) - assert fxpt[s_scan.spatial_axis_[0]].value != 0 - - -def test_scan_fix_point(): - m = te.var("m") - n = te.var("n") - l = te.var("l") - x = te.compute((l, m, n), lambda *i: tvm.tir.const(1, "float32"), name="x") - s_state = te.placeholder((l, m, n)) - s_init = te.compute((1, m, n), lambda _, i, j: x[0, i, j], name="s_init") - - def test_scan0(): - s_update = te.compute( - (l, m, n), lambda t, i, j: x[t, j, i] + s_state[t - 1, i, j], 
name="update" - ) - s_scan = tvm.te.scan(s_init, s_update, s_state) - body = tvm.te.schedule.ScanGetBody(s_scan.op) - fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op) - assert fxpt[s_scan.op.spatial_axis_[0]].value == 1 - assert fxpt[s_scan.op.spatial_axis_[1]].value == 1 - - def test_scan1(): - s_update = te.compute( - (l, m, n), lambda t, i, j: x[t, j, i] + s_state[t - 1, j, i], name="update" - ) - s_scan = tvm.te.scan(s_init, s_update, s_state) - body = tvm.te.schedule.ScanGetBody(s_scan.op) - fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op) - assert fxpt[s_scan.op.spatial_axis_[0]].value == 0 - assert fxpt[s_scan.op.spatial_axis_[1]].value == 0 - - def test_scan3_not_exact_reach(): - s_h1 = te.compute((l, n, m), lambda t, j, i: s_state[t - 1, i, j], name="h1") - s_h2 = te.compute((l, m, n), lambda t, i, j: s_state[t - 1, i, 10] * 2, name="h1") - s_update = te.compute( - (l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update" - ) - s_scan = tvm.te.scan(s_init, s_update, s_state) - body = tvm.te.schedule.ScanGetBody(s_scan.op) - fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op) - assert fxpt[s_scan.op.spatial_axis_[0]].value == 1 - assert fxpt[s_scan.op.spatial_axis_[1]].value == 0 - - def test_scan4_reach_other(): - s_h1 = te.compute((l, n, m), lambda t, j, i: s_state[t - 1, j, j], name="h1") - s_h2 = te.compute((l, m, n), lambda t, i, j: s_state[t - 1, i, j] * 2, name="h1") - s_update = te.compute( - (l, m, n), lambda t, i, j: s_h1[t, j, i] + s_h2[t, i, j], name="update" - ) - s_scan = tvm.te.scan(s_init, s_update, s_state) - fxpt = tvm.te.schedule.ScanFixPointAnalysis(s_scan.op) - assert fxpt[s_scan.op.spatial_axis_[0]].value == 0 - assert fxpt[s_scan.op.spatial_axis_[1]].value == 0 - - def test_scan5_multi_output(): - m = te.var("m") - n = te.var("n") - x1 = te.placeholder((m, n)) - s1 = te.placeholder((m, n)) - x2 = te.placeholder((m, n)) - s2 = te.placeholder((m, n)) - s1_init = te.compute((1, n), lambda _, i: x1[0, i]) - 
s2_init = te.compute((1, n), lambda _, i: x2[0, i]) - s1_update = te.compute((m, n), lambda t, i: s1[t - 1, i] + x1[t, i]) - s2_update = te.compute((m, n), lambda t, i: x2[t, i] + s2[t - 1, i]) - r0, r1 = tvm.te.scan([s1_init, s2_init], [s1_update, s2_update], [s1, s2]) - body = tvm.te.schedule.ScanGetBody(r0.op) - fxpt = tvm.te.schedule.ScanFixPointAnalysis(r0.op) - assert fxpt[r1.op.spatial_axis_[0]].value == 1 - - test_scan0() - test_scan1() - test_scan3_not_exact_reach() - test_scan4_reach_other() - test_scan5_multi_output() - - -def test_create_read_graph(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j]) - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3) - - g = tvm.te.schedule.CreateReadGraph([A2.op]) - - assert g[A2.op][0] == A1 - assert g[A1.op][0] == A - post_order = tvm.te.schedule.PostDFSOrder([A2.op], g) - assert post_order[0] == A.op - assert post_order[1] == A1.op - - -if __name__ == "__main__": - test_scan() - test_create_read_graph() - test_scan_fix_point() diff --git a/tests/python/te/test_te_schedule_lstm.py b/tests/python/te/test_te_schedule_lstm.py deleted file mode 100644 index abdf81d3795d..000000000000 --- a/tests/python/te/test_te_schedule_lstm.py +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -import tvm -from tvm import te - - -def test_lstm_cell_inline(): - num_step = 128 - num_input = 256 - num_hidden = 1152 - batch_size = 4 - # Global transition matrix - X = te.placeholder((num_step - 1, batch_size, num_input), name="X") - Wi2h = te.placeholder((4, num_hidden, num_input), name="Wi2h") - Wh2h = te.placeholder((4, num_hidden, num_hidden), name="Wh2h") - # h: output hidden state, c: cell state. - s_state_h = te.placeholder((num_step, batch_size, num_hidden)) - s_state_c = te.placeholder((num_step, batch_size, num_hidden)) - s_init_c = te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_c") - s_init_h = te.compute((1, batch_size, num_hidden), lambda *i: 0.0, name="init_h") - # LSTM transition - k = te.reduce_axis((0, num_input), name="ki2h") - s_i2h = te.compute( - (num_step, 4, batch_size, num_hidden), - lambda t, x, i, j: te.sum(X[t - 1, i, k] * Wi2h[x, j, k], axis=k), - name="s_i2h", - ) - k = te.reduce_axis((0, num_hidden), name="ki2h") - s_h2h = te.compute( - (num_step, 4, batch_size, num_hidden), - lambda t, x, i, j: te.sum(s_state_h[t - 1, i, k] * Wh2h[x, j, k], axis=k), - name="s_h2h", - ) - # Gate rules - gates = te.compute(s_i2h.shape, lambda *i: s_i2h(*i) + s_h2h(*i), name="gates") - gshape = (num_step, batch_size, num_hidden) - in_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, 0, i, j]), name="in_gate") - in_transform = te.compute( - gshape, lambda t, i, j: te.tanh(gates[t, 1, i, j]), name="in_transform" - ) - forget_gate = te.compute( - gshape, lambda t, i, j: te.sigmoid(gates[t, 2, i, j]), name="forget_gate" - ) - out_gate = te.compute(gshape, lambda t, i, j: te.sigmoid(gates[t, 3, i, j]), name="out_gate") - next_c = te.compute( - gshape, - lambda t, i, j: forget_gate[t, i, j] * s_state_c[t - 1, i, j] - + in_gate[t, i, j] * in_transform[t, i, j], - name="next_c", - ) - next_h = te.compute( - gshape, lambda 
t, i, j: out_gate[t, i, j] * te.tanh(next_c[t, i, j]), name="next_h" - ) - update_c = te.compute(gshape, lambda *i: next_c(*i), name="update_c") - update_h = te.compute(gshape, lambda *i: next_h(*i), name="update_h") - # schedule - scan_h, scan_c = tvm.te.scan( - [s_init_h, s_init_c], - [update_h, update_c], - [s_state_h, s_state_c], - inputs=[X], - name="lstm_scan", - ) - # schedule - s = te.create_schedule(scan_h.op) - # Inline gate computations - s[gates].compute_inline() - s[in_gate].compute_inline() - s[in_transform].compute_inline() - s[forget_gate].compute_inline() - s[out_gate].compute_inline() - # verify we can lower correctly - tvm.lower(s, [X, Wi2h, Wh2h, scan_h, scan_c]) - - -if __name__ == "__main__": - test_lstm_cell_inline() diff --git a/tests/python/te/test_te_schedule_ops.py b/tests/python/te/test_te_schedule_ops.py deleted file mode 100644 index 1ff0297539ce..000000000000 --- a/tests/python/te/test_te_schedule_ops.py +++ /dev/null @@ -1,695 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import numpy as np - -import tvm -from tvm import te -from tvm.driver.build_module import schedule_to_module - - -def test_const(): - x = tvm.te.const(1, "int32") - assert x.dtype == "int32" - assert isinstance(x, tvm.tir.IntImm) - - -def test_schedule0(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - s = te.create_schedule(A1.op) - - mod = schedule_to_module(s, [A, A1]) - assert isinstance(mod["main"], tvm.tir.PrimFunc) - - -def test_schedule1(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - - s = te.create_schedule(A1.op) - xo, xi = s[A1].split(A1.op.axis[0], 8) - s[A1].pragma(xo, "auto_unroll_max_step", 10) - - mod = schedule_to_module(s, [A, A1]) - assert isinstance(mod["main"], tvm.tir.PrimFunc) - - -def test_schedule2(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - - s = te.create_schedule(A2.op) - xo, xi = s[A2].split(A2.op.axis[0], 8) - s[A1].compute_at(s[A2], xo) - - mod = schedule_to_module(s, [A, A2]) - assert isinstance(mod["main"], tvm.tir.PrimFunc) - - -def test_schedule_scan(): - m = te.var("m") - n = te.var("n") - x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - s_state = te.placeholder((m, n)) - s_init = te.compute((1, n), lambda _, i: x[0, i]) - s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + x[t, i]) - res = tvm.te.scan(s_init, s_update, s_state) - - assert tuple(res.shape) == (m, n) - s = te.create_schedule(res.op) - s = s.normalize() - ir = tvm.lower(s, [s_state], simple_mode=True) - bounds = tvm.te.schedule.InferBound(s) - assert bounds[res.op.scan_axis].min.value == 1 - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_inline_multi_reduce(): - def 
argmax_comp(x, y): - idx = tvm.tir.Select((x[1] >= y[1]), x[0], y[0]) - val = tvm.tir.Select((x[1] >= y[1]), x[1], y[1]) - return idx, val - - def argmax_init(idx_typ, val_typ): - return tvm.tir.const(-1, idx_typ), tvm.te.min_value(val_typ) - - argmax = te.comm_reducer(argmax_comp, argmax_init, name="argmax") - m = te.var("m") - n = te.var("n") - val = te.placeholder((m, n), name="val", dtype="float32") - val1 = te.compute((m, n), lambda i, j: val[i, j] + 1, name="val1") - val2 = te.compute((m, n), lambda i, j: te.exp(val1[i, j]), name="val2") - k = te.reduce_axis((0, n), "k") - T_idx, T_val = te.compute((m,), lambda i: argmax((k.var, val2[i, k]), axis=k), name="T") - s = te.create_schedule(T_idx.op) - s[val1].compute_inline() - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_auto_inline(): - def elemwise(): - m = te.var("m") - n = te.var("n") - A = te.placeholder((m, n), name="A") - B = te.placeholder((m, n), name="B") - C = te.placeholder((m, n), name="C") - T1 = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="T1") - T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2") - - return te.create_schedule(T2.op), T1 - - def broadcast(): - m = te.var("m") - n = te.var("n") - A = te.placeholder((1,), name="A") - B = te.placeholder((m, n), name="B") - C = te.placeholder((m, n), name="C") - T1 = te.compute((m, n), lambda i, j: A(0) * B(i, j), name="T1", tag="broadcast") - T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2") - - return te.create_schedule(T2.op), T1 - - def injective(): - m = te.var("m") - n = te.var("n") - A = te.placeholder((m,), name="A") - B = te.placeholder((m, n), name="B") - C = te.placeholder((m, n), name="C") - T1 = te.compute((m, n), lambda i, j: A(i) * B(i, j), name="T1") - T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2") - - return te.create_schedule(T2.op), T1 - - def check_auto_inline(schedule_func, 
auto_inline_func): - s, T1 = schedule_func() - # before auto inline the attach type is AttachType.kGroupRoot - assert s[T1].attach_type == 1 - auto_inline_func(s) - # after auto inline the attach type is AttachType.kInline - assert s[T1].attach_type == 2 - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - check_auto_inline(elemwise, tvm.te.schedule.AutoInlineElemWise) - check_auto_inline(broadcast, tvm.te.schedule.AutoInlineBroadcast) - check_auto_inline(injective, tvm.te.schedule.AutoInlineInjective) - - -def test_schedule_const_bound(): - n = 128 - A = te.placeholder((n,), name="A") - A1 = te.compute((n,), lambda i: A[i] + 1, name="A1") - s = te.create_schedule(A1.op) - xo, xi = s[A1].split(A1.op.axis[0], 8) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_inline_mixed(): - n = te.var("n") - A = te.placeholder((n,), name="A") - A1 = te.compute(A.shape, lambda *i: A(*i) + 1, name="A1") - A2 = te.compute(A.shape, lambda *i: A1(*i) + 2, name="A2") - C = te.compute((n,), lambda i: A2[i] + A1[i], name="C") - - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=8) - s[A1].compute_at(s[C], xo) - s[A2].compute_inline() - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - def check(x): - if isinstance(x, tvm.tir.Call): - assert x.func != A2 - - tvm.tir.stmt_functor.post_order_visit(s[C].op.body[0], check) - - -def test_scan_inline1(): - m = te.var("m") - n = te.var("n") - x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - s_state1 = te.placeholder((m, n)) - s_state2 = te.placeholder((m, n)) - s_init1 = te.compute((1, n), lambda _, i: x[0, i]) - s_init2 = te.compute((1, n), lambda _, i: x[0, i]) - s_x1 = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + x[t, i], name="x1") - s_x2 = te.compute((m, n), 
lambda t, i: s_state2[t - 1, i] + 1, name="x2") - s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1") - s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2") - res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2]) - s = te.create_schedule(res1.op) - s[s_x1].compute_inline() - stmt = tvm.lower(s, [x, res1, res2]) - - -def test_scan_inline2(): - m = te.var("m") - n = te.var("n") - x = te.compute((m, n), lambda i, j: tvm.tir.const(1, "float32"), name="x") - s_state1 = te.placeholder((m, n)) - s_state2 = te.placeholder((m, n)) - s_init1 = te.compute((1, n), lambda _, i: x[0, i]) - s_init2 = te.compute((1, n), lambda _, i: x[0, i]) - s_xx = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + x[t, i], name="xx") - s_x1 = te.compute((m, n), lambda t, i: s_xx[t, i] + 1, name="x1") - s_x2 = te.compute((m, n), lambda t, i: s_xx[t, i] + s_state2[t - 1, 2], name="x2") - s_update1 = te.compute((m, n), lambda t, i: s_x1[t, i], "u1") - s_update2 = te.compute((m, n), lambda t, i: s_x2[t, i], "u2") - res1, res2 = tvm.te.scan([s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2]) - s = te.create_schedule(res1.op) - s[s_xx].compute_inline() - s[s_x1].compute_inline() - s[s_x2].compute_inline() - stmt = tvm.lower(s, [x, res1, res2]) - - -def test_schedule_cache(): - m = te.var("m") - n = te.var("n") - A = te.placeholder((m, n), name="A") - B = te.placeholder((m, n), name="B") - C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="C") - - s = te.create_schedule(C.op) - AA = s.cache_read(A, "shared", readers=[C]) - CC = s.cache_write(C, "shared") - s[AA].compute_at(s[CC], CC.op.axis[0]) - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_schedule_middle_cache(): - m = te.var("m") - n = te.var("n") - A = te.placeholder((m, n), name="A") - B = te.placeholder((m, n), name="B") - - C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="C") - D = te.compute((m, 
n), lambda i, j: C(i, j), name="D") - - s = te.create_schedule(D.op) - AA = s.cache_read(A, "local", readers=[C]) - BB = s.cache_read(B, "local", readers=[C]) - CC = s.cache_read(C, "local", readers=[D]) - DD = s.cache_write(D, "local") - # s[AA].compute_at(s[CC], CC.op.axis[0]) - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_schedule_cache_relayout1(): - m = te.var("m") - n = te.var("n") - A = te.placeholder((m, n), name="A") - B = te.placeholder((m, n), name="B") - C = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="C") - - s = te.create_schedule(C.op) - s[C].reorder(C.op.axis[1], C.op.axis[0]) - CC = s.cache_write(C, "global") - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_schedule_cache_relayout2(): - m = te.var("m") - n = te.var("n") - A = te.placeholder((m * 4, n), name="A") - B = te.placeholder((m * 4, n), name="B") - C = te.compute(A.shape, lambda i, j: A(i, j) * B(i, j), name="C") - s = te.create_schedule(C.op) - x, y = C.op.axis - xo, xi = s[C].split(x, factor=4) - s[C].reorder(xo, y, xi) - CC = s.cache_write(C, "global") - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_schedule_cache_relayout3(): - m = te.var("m") - n = te.var("n") - A = te.placeholder((m * 4, n), name="A") - B = te.placeholder((m * 4, n), name="B") - k = te.reduce_axis((0, n), "k") - C = te.compute((A.shape[0],), lambda i: te.sum(A(i, k) * B(i, k), axis=k), name="C") - s = te.create_schedule(C.op) - x = C.op.axis[0] - xo, xi = s[C].split(x, factor=4) - CC = s.cache_write(C, "global") - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_schedule_cache_relayout4(): - def _compute(*indice): - return A(*indice) + 1, B(*indice) / 2 - - m = te.var("m") - n = te.var("n") - A = te.placeholder((m * 4, n), name="A") - B = 
te.placeholder((m * 4, n), name="B") - C1, C2 = te.compute(A.shape, _compute, name="C") - s = te.create_schedule([C1.op, C2.op]) - C1_cache, C2_cache = s.cache_write([C1, C2], "local") - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def intrin_gemv(m, n): - w = te.placeholder((m, n), name="w") - x = te.placeholder((n,), name="x") - k = te.reduce_axis((0, n), name="k") - z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z") - Wb = tvm.tir.decl_buffer( - w.shape, w.dtype, name="W", offset_factor=16, strides=[te.var("ldw"), 1] - ) - - def intrin_func(ins, outs): - ww, xx = ins - zz = outs[0] - ww_ptr = ww.access_ptr("r") - xx_ptr = xx.access_ptr("r") - zz_ptr = zz.access_ptr("w") - body = tvm.tir.call_packed("gemm", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) - reset = tvm.tir.call_packed("fill_zero", zz_ptr, n) - update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) - return body, reset, update - - buffer_params = {"data_alignment": 16, "offset_factor": 16} - return te.decl_tensor_intrin( - z.op, intrin_func, binds={w: Wb}, default_buffer_params=buffer_params - ) - - -def test_schedule_tensor_compute1(): - # basic: split, reorder, tile - M, N, L = 2048, 1024, 512 - factor, rfactor = 16, 16 - A = te.placeholder((N // factor, L // rfactor, factor, rfactor), name="A") - B = te.placeholder((M, L // rfactor, rfactor), name="B") - k = te.reduce_axis((0, L // rfactor), name="k") - - gemv = intrin_gemv(factor, rfactor) - C = te.compute( - (N, M // factor, factor), - lambda i, j: gemv(A[i, k, 0:factor, 0:factor], B[j, k, 0:rfactor], reduce_axis=k), - name="C", - ) - - s = te.create_schedule(C.op) - ai, aj, ax = s[C].op.axis - aio, aii = s[C].split(ai, 16) - s[C].reorder(aio, aj, aii) - aioo, ajo, aioi, aji = s[C].tile(aio, aj, 16, 4) - - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def intrin_vadd(n, 
cache_read=False, cache_write=False): - scope_ubuf = "local" - dtype = "float32" - x = te.placeholder((n,), dtype=dtype, name="vx") - y = te.placeholder((n,), dtype=dtype, name="vy") - z = te.compute(x.shape, lambda i: x[i] + y[i], name="z") - s = te.create_schedule(z.op) - - def create_buffer(t): - return tvm.tir.decl_buffer( - t.shape, t.dtype, name="W" + t.name, scope=scope_ubuf, offset_factor=16 - ) - - binds = {} - if cache_read: - binds[x] = create_buffer(x) - binds[y] = create_buffer(y) - if cache_write: - binds[z] = create_buffer(z) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - ib.emit( - tvm.tir.call_extern( - outs[0].dtype, - "vadd", - ins[0].access_ptr("r"), - ins[1].access_ptr("r"), - outs[0].access_ptr("wr"), - ) - ) - return ib.get() - - return te.decl_tensor_intrin( - z.op, intrin_func, binds=binds, default_buffer_params={"offset_factor": 16} - ) - - -def test_schedule_tensor_compute2(): - # cache_read, cache_write - M = 1024 - factor = 16 - dtype = "float32" - scope_ubuf = "local" - - A = te.placeholder((M // factor, factor), name="A", dtype=dtype) - B = te.placeholder((M // factor, factor), name="B", dtype=dtype) - - vadd = intrin_vadd(factor, True, True) - C = te.compute((M // factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name="C") - - s = te.create_schedule(C.op) - AL = s.cache_read(A, scope_ubuf, C) - BL = s.cache_read(B, scope_ubuf, C) - CL = s.cache_write(C, scope_ubuf) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_schedule_tensor_compute3(): - # compute_at - M = 1024 - factor = 16 - dtype = "float32" - A = te.placeholder((M // factor, factor), name="A", dtype=dtype) - B = te.placeholder((M // factor, factor), name="B", dtype=dtype) - Bi = te.compute((M // factor, factor), lambda i, j: B[i, j] + 5, name="Bi") - - vadd = intrin_vadd(factor) - C = te.compute((M // factor, factor), lambda i: vadd(A[i, 0:factor], Bi[i, 0:factor]), 
name="C") - s = te.create_schedule(C.op) - s[Bi].compute_at(s[C], C.op.axis[0]) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_loop_dep_reduce(): - X = te.placeholder(shape=(10,), name="x") - - def f(n): - rv = te.reduce_axis((0, n)) - return te.sum(X[rv], axis=rv) - - Y = te.compute(X.shape, f, name="y") - s = te.create_schedule([Y.op]) - f = tvm.build(s, [X, Y]) - - -def test_loop_dep_reduce_cache_write(): - X = te.placeholder(shape=(10,), name="x") - - def f(n): - rv = te.reduce_axis((0, n)) - init = lambda dtype: tvm.tir.Select(n > 1, tvm.tir.const(0, dtype), n.astype(dtype)) - sum = te.comm_reducer(lambda x, y: tvm.te.max(x + y, n.astype("float32")), init, name="sum") - return sum(X[rv], axis=rv) - - Y = te.compute(X.shape, f, name="y") - s = te.create_schedule([Y.op]) - s.cache_write(Y, "local") - f = tvm.build(s, [X, Y]) - - -def test_reduction_and_dummy_fuse_split(): - n = 10 - X = te.placeholder(shape=(n,), dtype="int32", name="X") - k = te.reduce_axis((0, n)) - Y = te.compute((), lambda: te.sum(X[k], k), name="Y") - s = te.create_schedule([Y.op]) - ax = s[Y.op].fuse(*Y.op.axis) - axo, axi = s[Y.op].split(ax, nparts=20) - f = tvm.build(s, [Y, X]) - - args = [tvm.nd.empty((), "int32")] + [tvm.nd.array(np.ones((n,), dtype="int32"))] - f(*args) - assert args[0].numpy() == n - - n = 10 - X = te.placeholder(shape=(n,), dtype="int32", name="X") - k = te.reduce_axis((0, n)) - Y = te.compute((n,), lambda i: te.sum(X[k], k), name="Y") - s = te.create_schedule([Y.op]) - ax = s[Y.op].fuse(*(list(Y.op.axis) + list(Y.op.reduce_axis))) - f = tvm.build(s, [Y, X]) - - args = [tvm.nd.array(np.ones((n,), dtype="int32"))] + [ - tvm.nd.array(np.ones((n,), dtype="int32")) - ] - f(*args) - assert np.all(args[0].numpy() == n) - - -def test_schedule_compute_inline(): - shape = [10, 1024] - A = te.placeholder(shape, name="A") - B = te.placeholder(shape, name="B") - C = te.compute(shape, lambda *index: 
A(*index) + B(*index), name="C") - - def _compute(*index): - return C(*index), C(*index) * B(*index) - - F, E = te.compute(shape, _compute, name="F") - - s = te.create_schedule([F.op, E.op]) - AL = s.cache_read(A, "local", [C]) - BL = s.cache_read(B, "local", [C, E]) - CL = s.cache_write(C, "local") - FL, EL = s.cache_write([F, E], "local") - s[C].compute_inline() - - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - -def test_local_stage_predicate(): - m = 1 - n = 3 - p = 2 - A = tvm.te.placeholder((m, n, p), name="A") - B = tvm.te.compute((m, n, p), lambda bi, bj, bk: A[bi, bj, bk], name="B") - C = tvm.te.compute((m, n, p), lambda ci, cj, ck: B[ci, cj, ck], name="C") - by = tvm.te.thread_axis("blockIdx.y") - tx = tvm.te.thread_axis("threadIdx.x") - vx = tvm.te.thread_axis("vthread") - - def schedule(thread_tag, mem_scope): - s = tvm.te.create_schedule(C.op) - s[B].compute_at(s[C], s[C].op.axis[0]) - s[B].set_scope(mem_scope) - bno, bni = s[B].split(s[B].op.axis[1], n) - bx = tvm.te.thread_axis("blockIdx.x") - s[C].bind(s[C].op.axis[0], bx) - s[C].bind(s[C].op.axis[1], thread_tag) - s[B].bind(bni, thread_tag) - return s - - def collect_visit(stmt, f): - ret = [] - tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x))) - return ret - - # local vs. threadIdx - s = schedule(tx, "local") - lowered_body = tvm.lower(s, [A, C])["main"].body - assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse))) - # local vs. vthread - s = schedule(vx, "local") - lowered_body = tvm.lower(s, [A, C])["main"].body - assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse))) - # shared vs. 
blockIdx - s = schedule(by, "shared") - lowered_body = tvm.lower(s, [A, C])["main"].body - assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse))) - - -def test_local_stage_predicate2(): - A = tvm.te.placeholder((128,), name="A") - B = tvm.te.compute((128,), lambda bi: A[bi] + 1, name="B") - C = tvm.te.compute((128,), lambda ci: B[ci] + 2, name="C") - s = tvm.te.create_schedule(C.op) - AA = s.cache_read(A, "local", [B]) - s[B].set_scope("shared") - block_x = tvm.te.thread_axis("blockIdx.x") - thread_x = tvm.te.thread_axis((0, 32), "threadIdx.x") - oc, ic = s[C].split(s[C].op.axis[0], factor=64) - ooc, ioc = s[C].split(oc, factor=2) - oic, iic = s[C].split(ic, factor=32) - s[C].bind(ooc, block_x) - s[C].bind(iic, thread_x) - s[B].compute_at(s[C], ioc) - ob, ib = s[B].split(s[B].op.axis[0], factor=32) - s[B].bind(ib, thread_x) - s[AA].compute_root() - s[AA].compute_at(s[C], ooc) - oaa, iaa = s[AA].split(s[AA].op.axis[0], factor=32) - s[AA].bind(iaa, thread_x) - lowered_body = tvm.lower(s, [A, C])["main"].body - - def collect_visit(stmt, f): - ret = [] - tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x))) - return ret - - def visit_stmt(op): - if isinstance(op, tvm.tir.Allocate): - return op.extents[0].value == 97 - return False - - assert not any(collect_visit(lowered_body, lambda x: isinstance(x, tvm.tir.IfThenElse))) - assert any(collect_visit(lowered_body, visit_stmt)) - - -def test_schedule_record_gemm(): - with tvm.transform.PassContext(config={"te.keep_schedule_record": True}): - M, K, N = 1024, 1024, 1024 - k = te.reduce_axis((0, K), "k") - A = te.placeholder((M, K), name="A") - B = te.placeholder((K, N), name="B") - C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C") - s = te.create_schedule(C.op) - # currently there are no other applied primitives - # size of schedule record is expected to be 1 (vanilla schedule) - assert len(s.schedule_record) == 1 - # apply sequential 
optimizatoin primitives - block_size, factor = 32, 8 - # tile -> split + split + reorder - mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], block_size, block_size) - ko, ki = s[C].split(k, factor=factor) - s[C].reorder(mo, ko, no, mi, ki, ni) - s[C].vectorize(ni) - s[C].parallel(mo) - assert len(s.schedule_record) == 8 - # compare primitive names - expected_names = [ - "vanilla", - "split", - "split", - "reorder", - "split", - "reorder", - "vectorize", - "parallel", - ] - for i in range(len(s.schedule_record)): - assert s.primitive_record[i] == expected_names[i] - - -def test_schedule_record_misc(): - s = te.create_schedule([]) - # size of schedule record is expected to be 0 (no storing behavior) - assert len(s.schedule_record) == 0 - - with tvm.transform.PassContext(config={"te.keep_schedule_record": True}): - s = te.create_schedule([]) - # size of schedule record is expected to be 1 (vanilla schedule) - assert len(s.schedule_record) == 1 - - stg = te.compute((), lambda *args: 0, name="empty_op") - s = te.create_schedule(stg.op) - # size of schedule record is expected to be 1 (vanilla schedule) - assert len(s.schedule_record) == 1 - - -if __name__ == "__main__": - test_loop_dep_reduce() - test_loop_dep_reduce_cache_write() - test_schedule_middle_cache() - test_inline_multi_reduce() - test_schedule_cache_relayout4() - test_schedule_cache_relayout3() - test_schedule_cache_relayout2() - test_schedule_cache_relayout1() - test_schedule_const_bound() - test_scan_inline1() - test_scan_inline2() - test_inline_mixed() - test_auto_inline() - test_schedule_scan() - test_schedule0() - test_schedule1() - test_schedule2() - test_schedule_cache() - test_schedule_tensor_compute1() - test_schedule_tensor_compute2() - test_schedule_tensor_compute3() - test_reduction_and_dummy_fuse_split() - test_schedule_compute_inline() - test_local_stage_predicate() - test_local_stage_predicate2() - test_schedule_record_gemm() - test_schedule_record_misc() diff --git 
a/tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py b/tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py deleted file mode 100644 index 83584ad56400..000000000000 --- a/tests/python/te/test_te_schedule_postproc_rewrite_for_tensor_core.py +++ /dev/null @@ -1,231 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# 'License'); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import tvm -from tvm import te -from tvm import topi -import numpy as np -import tvm.testing - - -def tensor_core_matmul(warp_tile_m=16, m=64, n=32, l=96): - A = te.placeholder((n, l), name="A", dtype="float16") - B = te.placeholder((l, m), name="B", dtype="float16") - k = te.reduce_axis((0, l), name="k") - C = te.compute( - (n, m), lambda i, j: te.sum(A[i, k].astype("float32") * B[k, j].astype("float32"), axis=k) - ) - s = te.create_schedule(C.op) - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - AA = s.cache_read(A, "shared", [C]) - AL = s.cache_read(AA, "local", [C]) - BB = s.cache_read(B, "shared", [C]) - BL = s.cache_read(BB, "local", [C]) - CL = s.cache_write(C, "local") - - bx = 4 - by = 32 - step_k = 8 - v = 4 - TX = 8 - TY = 1 - tile_x = bx * TX - tile_y = by * TY - WX = min(warp_tile_m, tile_x) - tile_k = 16 - vthread = 1 - - yo, ty = s[C].split(y, tile_y * vthread) - vy, ty = s[C].split(ty, tile_y) - ty, yi = s[C].split(ty, TY) - - xo, xi = s[C].split(x, tile_x) - tz, xi = s[C].split(xi, WX) - tx, xi = s[C].split(xi, TX) - ko, ki = s[CL].split(k, step_k * tile_k) - kl, ki = s[CL].split(ki, tile_k) - - s[C].reorder(yo, xo, tz, ty, tx, yi, xi) - s[C].bind(yo, te.thread_axis("blockIdx.y")) - s[C].bind(xo, te.thread_axis("blockIdx.x")) - s[C].bind(ty, te.thread_axis("threadIdx.y")) - s[C].bind(tz, te.thread_axis("threadIdx.z")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy")) - s[CL].compute_at(s[C], tx) - yo, xo = CL.op.axis - s[CL].reorder(ko, kl, ki, yo, xo) - - s[AA].compute_at(s[CL], ko) - xo, xi = s[AA].split(s[AA].op.axis[1], factor=bx * v) - tz, tx = s[AA].split(xi, factor=(WX // TX) * v) - tx, vec = s[AA].split(tx, factor=v) - fused = s[AA].fuse(s[AA].op.axis[0], xo) - _, ty = s[AA].split(fused, factor=by) - s[AA].bind(ty, te.thread_axis("threadIdx.y")) - s[AA].bind(tz, te.thread_axis("threadIdx.z")) - s[AA].bind(tx, te.thread_axis("threadIdx.x")) - s[AA].vectorize(vec) - - 
s[BB].compute_at(s[CL], ko) - xo, xi = s[BB].split(s[BB].op.axis[1], factor=bx * v) - tz, tx = s[BB].split(xi, factor=(WX // TX) * v) - tx, vec = s[BB].split(tx, factor=v) - fused = s[BB].fuse(s[BB].op.axis[0], xo) - _, ty = s[BB].split(fused, factor=by) - s[BB].bind(ty, te.thread_axis("threadIdx.y")) - s[BB].bind(tz, te.thread_axis("threadIdx.z")) - s[BB].bind(tx, te.thread_axis("threadIdx.x")) - s[BB].vectorize(vec) - - s[AL].compute_at(s[CL], kl) - s[BL].compute_at(s[CL], kl) - - s[CL].pragma(ko, "tensor_core") - - func = tvm.build(s, [A, B, C], "cuda") - - dev = tvm.cuda(0) - a_np = np.random.uniform(size=(n, l)).astype(A.dtype) - b_np = np.random.uniform(size=(l, m)).astype(B.dtype) - c_np = np.zeros((n, m), dtype=np.float32) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev) - func(a, b, c) - evaluator = func.time_evaluator(func.entry_name, dev, number=3) - print("gemm m=%d n=%d k=%d: %f ms" % (m, n, l, evaluator(a, b, c).mean * 1e3)) - - c_np = np.dot(a_np, b_np) - np.testing.assert_allclose(c_np, c.numpy(), rtol=1e-3) - - -def tensor_core_batch_matmul(warp_tile_m=16, m=64, n=32, l=96, batch=2): - A = te.placeholder((batch, n, l), name="A", dtype="float16") - B = te.placeholder((batch, l, m), name="B", dtype="float16") - k = te.reduce_axis((0, l), name="k") - C = te.compute( - (batch, n, m), lambda b, i, j: te.sum((A[b, i, k] * B[b, k, j]).astype("float32"), axis=k) - ) - s = te.create_schedule(C.op) - z, y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - AA = s.cache_read(A, "shared", [C]) - AL = s.cache_read(AA, "local", [C]) - BB = s.cache_read(B, "shared", [C]) - BL = s.cache_read(BB, "local", [C]) - CL = s.cache_write(C, "local") - - bx = 2 - by = 32 - step_k = 8 - v = 4 - TX = 8 - TY = 1 - tile_x = bx * TX - tile_y = by * TY - WX = min(warp_tile_m, tile_x) - tile_k = 16 - vthread = 1 - - yo, ty = s[C].split(y, tile_y * vthread) - vy, ty = s[C].split(ty, tile_y) - ty, yi = 
s[C].split(ty, TY) - - xo, xi = s[C].split(x, tile_x) - tz, xi = s[C].split(xi, WX) - tx, xi = s[C].split(xi, TX) - ko, ki = s[CL].split(k, step_k * tile_k) - kl, ki = s[CL].split(ki, tile_k) - - s[C].reorder(z, yo, xo, tz, ty, tx, yi, xi) - s[C].bind(z, te.thread_axis("blockIdx.z")) - s[C].bind(yo, te.thread_axis("blockIdx.y")) - s[C].bind(xo, te.thread_axis("blockIdx.x")) - s[C].bind(ty, te.thread_axis("threadIdx.y")) - s[C].bind(tz, te.thread_axis("threadIdx.z")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - s[C].bind(vy, te.thread_axis((0, vthread), "vthread", name="vy")) - s[CL].compute_at(s[C], tx) - zo, yo, xo = CL.op.axis - s[CL].reorder(ko, kl, ki, zo, yo, xo) - - s[AA].compute_at(s[CL], ko) - xo, xi = s[AA].split(s[AA].op.axis[2], factor=bx * v) - tz, tx = s[AA].split(xi, factor=(WX // TX) * v) - tx, vec = s[AA].split(tx, factor=v) - fused = s[AA].fuse(s[AA].op.axis[1], xo) - _, ty = s[AA].split(fused, factor=by) - s[AA].bind(ty, te.thread_axis("threadIdx.y")) - s[AA].bind(tz, te.thread_axis("threadIdx.z")) - s[AA].bind(tx, te.thread_axis("threadIdx.x")) - s[AA].vectorize(vec) - - s[BB].compute_at(s[CL], ko) - xo, xi = s[BB].split(s[BB].op.axis[2], factor=bx * v) - tz, tx = s[BB].split(xi, factor=(WX // TX) * v) - tx, vec = s[BB].split(tx, factor=v) - fused = s[BB].fuse(s[BB].op.axis[1], xo) - _, ty = s[BB].split(fused, factor=by) - s[BB].bind(ty, te.thread_axis("threadIdx.y")) - s[BB].bind(tz, te.thread_axis("threadIdx.z")) - s[BB].bind(tx, te.thread_axis("threadIdx.x")) - s[BB].vectorize(vec) - - s[AL].compute_at(s[CL], kl) - s[BL].compute_at(s[CL], kl) - - s[CL].pragma(ko, "tensor_core") - - func = tvm.build(s, [A, B, C], "cuda") - - dev = tvm.cuda(0) - a_np = np.random.uniform(size=(batch, n, l)).astype(A.dtype) - b_np = np.random.uniform(size=(batch, l, m)).astype(B.dtype) - c_np = np.zeros((batch, n, m), dtype=np.float32) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((batch, n, m), dtype=C.dtype), dev) - 
func(a, b, c) - evaluator = func.time_evaluator(func.entry_name, dev, number=3) - print( - "batch gemm m=%d n=%d k=%d batch=%d: %f ms" - % (m, n, l, batch, evaluator(a, b, c).mean * 1e3) - ) - - for bs in range(batch): - c_np[bs, :, :] = np.dot(a_np[bs, :, :], b_np[bs, :, :]) - np.testing.assert_allclose(c_np, c.numpy(), rtol=1e-3) - - -@tvm.testing.requires_tensorcore -def test_tensor_core_matmul(): - tensor_core_matmul(16) # test with warp_tile 16x16x16 - tensor_core_matmul(8) # test with warp_tile 8x32x16 - tensor_core_matmul(32) # test with warp_tile 32x8x16 - - -@tvm.testing.requires_tensorcore -def test_tensor_core_batch_matmul(): - tensor_core_batch_matmul() - - -if __name__ == "__main__": - test_tensor_core_matmul() - test_tensor_core_batch_matmul() diff --git a/tests/python/te/test_te_schedule_tensor_core.py b/tests/python/te/test_te_schedule_tensor_core.py deleted file mode 100644 index d86b05ad83f1..000000000000 --- a/tests/python/te/test_te_schedule_tensor_core.py +++ /dev/null @@ -1,461 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# 'License'); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import tvm -from tvm import te -import numpy as np -from tvm.topi.testing import conv2d_nhwc_python -import tvm.testing - -VERIFY = True - - -def intrin_wmma_load_matrix(shape, scope): - n, m, l = shape - if scope == "wmma.matrix_a": - row, col = n, l - elif scope == "wmma.matrix_b": - row, col = l, m - A = te.placeholder((row, col), name="A", dtype="float16") - BA = tvm.tir.decl_buffer( - A.shape, A.dtype, scope="shared", data_alignment=32, offset_factor=row * col - ) - C = te.compute((row, col), lambda i, j: A[i, j], name="C") - BC = tvm.tir.decl_buffer( - C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=row * col - ) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - - BA = ins[0] - BC = outs[0] - ib.emit( - tvm.tir.call_intrin( - "handle", - "tir.tvm_load_matrix_sync", - BC.data, - n, - m, - l, - BC.elem_offset // (row * col), - BA.access_ptr("r"), - col, - "row_major", - ) - ) - return ib.get() - - return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) - - -def intrin_wmma_gemm(shape): - n, m, l = shape - A = te.placeholder((n, l), name="A", dtype="float16") - B = te.placeholder((l, m), name="B", dtype="float16") - k = te.reduce_axis((0, l), name="k") - C = te.compute( - (n, m), - lambda ii, jj: te.sum(A[ii, k].astype("float") * B[k, jj].astype("float"), axis=k), - name="C", - ) - BA = tvm.tir.decl_buffer( - A.shape, A.dtype, name="BA", scope="wmma.matrix_a", data_alignment=32, offset_factor=n * l - ) - BB = tvm.tir.decl_buffer( - B.shape, B.dtype, name="BB", scope="wmma.matrix_b", data_alignment=32, offset_factor=l * m - ) - BC = tvm.tir.decl_buffer( - C.shape, - C.dtype, - name="BC", - scope="wmma.accumulator", - data_alignment=32, - offset_factor=n * m, - ) - - def intrin_func(ins, outs): - BA, BB = ins - (BC,) = outs - - def init(): - ib = tvm.tir.ir_builder.create() - ib.emit( - tvm.tir.call_intrin( - "handle", - "tir.tvm_fill_fragment", - BC.data, - n, - m, - l, - BC.elem_offset // (n * m), - 0.0, - ) - ) 
- return ib.get() - - def update(): - ib = tvm.tir.ir_builder.create() - ib.emit( - tvm.tir.call_intrin( - "handle", - "tir.tvm_mma_sync", - BC.data, - BC.elem_offset // (n * m), - BA.data, - BA.elem_offset // (n * l), - BB.data, - BB.elem_offset // (l * m), - BC.data, - BC.elem_offset // (n * m), - ) - ) - return ib.get() - - return update(), init(), update() - - return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC}) - - -def intrin_wmma_store_matrix(shape): - n, m, l = shape - A = te.placeholder((n, m), name="A", dtype="float32") - BA = tvm.tir.decl_buffer( - A.shape, A.dtype, scope="wmma.accumulator", data_alignment=32, offset_factor=n * m - ) - C = te.compute((n, m), lambda i, j: A[i, j], name="C") - BC = tvm.tir.decl_buffer( - C.shape, C.dtype, scope="global", data_alignment=32, offset_factor=n * m - ) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - - BA = ins[0] - BC = outs[0] - ib.emit( - tvm.tir.call_intrin( - "handle", - "tir.tvm_store_matrix_sync", - BA.data, - n, - m, - l, - BA.elem_offset // (n * m), - BC.access_ptr("w"), - m, - "row_major", - ) - ) - return ib.get() - - return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) - - -@tvm.testing.requires_tensorcore -def test_tensor_core_batch_matmal(): - batch_size = 4 - n = 512 - m, l = n, n - assert n % 32 == 0 - assert m % 8 == 0 - assert l % 16 == 0 - nn, mm, ll = n // 32, m // 8, l // 16 - A = te.placeholder((batch_size, nn, ll, 32, 16), name="A", dtype="float16") - B = te.placeholder((batch_size, ll, mm, 16, 8), name="B", dtype="float16") - k1 = te.reduce_axis((0, ll), name="k1") - k2 = te.reduce_axis((0, 16), name="k2") - C = te.compute( - (batch_size, nn, mm, 32, 8), - lambda b, i, j, ii, jj: te.sum( - A[b, i, k1, ii, k2].astype("float") * B[b, k1, j, k2, jj].astype("float"), axis=[k1, k2] - ), - name="Fragment_C", - ) - s = te.create_schedule(C.op) - - warp_size = 32 - kernel_size = 16 - block_row_warps = 2 - block_col_warps = 4 - 
warp_row_tiles = 4 - warp_col_tiles = 2 - chunk = 4 - - block_x = te.thread_axis("blockIdx.x") - block_y = te.thread_axis("blockIdx.y") - block_z = te.thread_axis("blockIdx.z") - thread_x = te.thread_axis("threadIdx.x") - thread_y = te.thread_axis("threadIdx.y") - thread_z = te.thread_axis("threadIdx.z") - - AS = s.cache_read(A, "shared", [C]) - BS = s.cache_read(B, "shared", [C]) - AF = s.cache_read(AS, "wmma.matrix_a", [C]) - BF = s.cache_read(BS, "wmma.matrix_b", [C]) - CF = s.cache_write(C, "wmma.accumulator") - - b, i, j, kernel_i, kernel_j = s[C].op.axis - i, ii = s[C].split(i, factor=warp_row_tiles) - block_i, i = s[C].split(i, factor=block_row_warps) - j, jj = s[C].split(j, factor=warp_col_tiles) - block_j, j = s[C].split(j, factor=block_col_warps) - s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j) - s[C].bind(b, block_z) - s[C].bind(block_i, block_x) - s[C].bind(block_j, block_y) - s[C].bind(i, thread_y) - s[C].bind(j, thread_z) - - s[CF].compute_at(s[C], j) - b, warp_i, warp_j, _i, _j = s[CF].op.axis - k, _k = CF.op.reduce_axis - ko, ki = s[CF].split(k, factor=chunk) - s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k) - - s[AF].compute_at(s[CF], ki) - s[BF].compute_at(s[CF], ki) - - s[AS].compute_at(s[CF], ko) - b, xo, yo, xi, yi = AS.op.axis - tx, xo = s[AS].split(xo, nparts=block_row_warps) - ty, yo = s[AS].split(yo, nparts=block_col_warps) - t = s[AS].fuse(xi, yi) - to, ti = s[AS].split(t, nparts=warp_size) - s[AS].bind(tx, thread_y) - s[AS].bind(ty, thread_z) - s[AS].bind(to, thread_x) - - s[BS].compute_at(s[CF], ko) - b, xo, yo, xi, yi = BS.op.axis - tx, xo = s[BS].split(xo, nparts=block_row_warps) - ty, yo = s[BS].split(yo, nparts=block_col_warps) - t = s[BS].fuse(xi, yi) - to, ti = s[BS].split(t, nparts=warp_size) - s[BS].bind(tx, thread_y) - s[BS].bind(ty, thread_z) - s[BS].bind(to, thread_x) - - s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_a")) - s[BF].tensorize(BF.op.axis[-2], 
intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_b")) - s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16))) - s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16))) - - func = tvm.build(s, [A, B, C], "cuda") - - dev = tvm.cuda(0) - a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype) - b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), dev) - func(a, b, c) - evaluator = func.time_evaluator(func.entry_name, dev, number=3) - print("gemm with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3)) - - if VERIFY: - func(a, b, c) - a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n) - b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n) - c_np = c.numpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n) - np.testing.assert_allclose( - c_np, np.matmul(a_np.astype(C.dtype), b_np.astype(C.dtype)), rtol=1e-4, atol=1e-4 - ) - - -@tvm.testing.requires_tensorcore -def test_tensor_core_batch_conv(): - # The sizes of inputs and filters - batch_size = 32 - height = 14 - width = 14 - in_channels = 32 - out_channels = 64 - kernel_h = 3 - kernel_w = 3 - pad_h = 1 - pad_w = 1 - stride_h = 1 - stride_w = 1 - block_size = 16 - - block_row_warps = 2 - block_col_warps = 4 - warp_row_tiles = 4 - warp_col_tiles = 2 - warp_size = 32 - chunk = 2 - - # Input feature map: (N, H, W, IC, n, ic) - data_shape = ( - batch_size // block_size, - height, - width, - in_channels // block_size, - block_size, - block_size, - ) - # Kernel: (H, W, IC, OC, ic, oc) - kernel_shape = ( - kernel_h, - kernel_w, - in_channels // block_size, - out_channels // block_size, - block_size, - block_size, - ) - - # Output feature map: (N, H, W, OC, n, oc) - output_shape = ( - batch_size // block_size, - height, - width, - out_channels // block_size, - block_size, - block_size, - ) - - assert batch_size % 
block_size == 0 - assert in_channels % block_size == 0 - assert out_channels % block_size == 0 - - kh = te.reduce_axis((0, kernel_h), name="kh") - kw = te.reduce_axis((0, kernel_w), name="kw") - ic = te.reduce_axis((0, in_channels // block_size), name="ic") - ii = te.reduce_axis((0, block_size), name="ii") - - # Algorithm - A = te.placeholder(data_shape, name="A", dtype="float16") - W = te.placeholder(kernel_shape, name="W", dtype="float16") - Apad = te.compute( - ( - batch_size // block_size, - height + 2 * pad_h, - width + 2 * pad_w, - in_channels // block_size, - block_size, - block_size, - ), - lambda n, h, w, i, nn, ii: tvm.tir.if_then_else( - tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width), - A[n, h - pad_h, w - pad_w, i, nn, ii], - tvm.tir.const(0.0, "float16"), - ), - name="Apad", - ) - Conv = te.compute( - output_shape, - lambda n, h, w, o, nn, oo: te.sum( - Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32") - * W[kh, kw, ic, o, ii, oo].astype("float32"), - axis=[ic, kh, kw, ii], - ), - name="Conv", - ) - - s = te.create_schedule(Conv.op) - s[Apad].compute_inline() - - AS = s.cache_read(Apad, "shared", [Conv]) - WS = s.cache_read(W, "shared", [Conv]) - AF = s.cache_read(AS, "wmma.matrix_a", [Conv]) - WF = s.cache_read(WS, "wmma.matrix_b", [Conv]) - ConvF = s.cache_write(Conv, "wmma.accumulator") - - block_x = te.thread_axis("blockIdx.x") - block_y = te.thread_axis("blockIdx.y") - block_z = te.thread_axis("blockIdx.z") - thread_x = te.thread_axis("threadIdx.x") - thread_y = te.thread_axis("threadIdx.y") - thread_z = te.thread_axis("threadIdx.z") - - nc, hc, wc, oc, nnc, ooc = Conv.op.axis - block_k = s[Conv].fuse(hc, wc) - s[Conv].bind(block_k, block_z) - nc, nci = s[Conv].split(nc, factor=warp_row_tiles) - block_i, nc = s[Conv].split(nc, factor=block_row_warps) - oc, oci = s[Conv].split(oc, factor=warp_col_tiles) - block_j, oc = s[Conv].split(oc, factor=block_col_warps) - s[Conv].reorder(block_k, block_i, 
block_j, nc, oc, nci, oci, nnc, ooc) - s[Conv].bind(block_i, block_x) - s[Conv].bind(block_j, block_y) - s[Conv].bind(nc, thread_y) - s[Conv].bind(oc, thread_z) - - s[ConvF].compute_at(s[Conv], oc) - n, h, w, o, nnf, oof = ConvF.op.axis - ko, ki = s[ConvF].split(ic, factor=chunk) - s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii) - - s[AF].compute_at(s[ConvF], kw) - s[WF].compute_at(s[ConvF], kw) - - s[WS].compute_at(s[ConvF], kh) - s[AS].compute_at(s[ConvF], kh) - - n, h, w, i, nn, ii = AS.op.axis - tx, xo = s[AS].split(n, nparts=block_row_warps) - ty, yo = s[AS].split(xo, nparts=block_col_warps) - t = s[AS].fuse(nn, ii) - to, ti = s[AS].split(t, factor=warp_size) - s[AS].bind(tx, thread_y) - s[AS].bind(ty, thread_z) - s[AS].bind(ti, thread_x) - - kh, kw, ic, o, ii, oo = WS.op.axis - tx, xo = s[WS].split(o, nparts=block_row_warps) - ty, yo = s[WS].split(xo, nparts=block_col_warps) - t = s[WS].fuse(ii, oo) - to, ti = s[WS].split(t, nparts=warp_size) - s[WS].bind(tx, thread_y) - s[WS].bind(ty, thread_z) - s[WS].bind(to, thread_x) - s[WS].vectorize(ti) - - s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_a")) - s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_b")) - s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16))) - s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16))) - - func = tvm.build(s, [A, W, Conv], "cuda") - - dev = tvm.cuda(0) - a_np = np.random.uniform(size=data_shape).astype(A.dtype) - w_np = np.random.uniform(size=kernel_shape).astype(W.dtype) - a = tvm.nd.array(a_np, dev) - w = tvm.nd.array(w_np, dev) - c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev) - evaluator = func.time_evaluator(func.entry_name, dev, number=3) - print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3)) - - if VERIFY: - func(a, w, c) - a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels) - w_np = w_np.transpose(0, 1, 2, 4, 3, 
5).reshape( - kernel_h, kernel_w, in_channels, out_channels - ) - c_np = ( - c.numpy().transpose((0, 4, 1, 2, 3, 5)).reshape(batch_size, height, width, out_channels) - ) - c_std = conv2d_nhwc_python( - a_np.astype(Conv.dtype), w_np.astype(Conv.dtype), (stride_h, stride_w), (pad_h, pad_w) - ).astype(Conv.dtype) - np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4) - - -if __name__ == "__main__": - test_tensor_core_batch_matmal() - test_tensor_core_batch_conv() diff --git a/tests/python/te/test_te_schedule_tensorize.py b/tests/python/te/test_te_schedule_tensorize.py deleted file mode 100644 index 419d3edb5c3d..000000000000 --- a/tests/python/te/test_te_schedule_tensorize.py +++ /dev/null @@ -1,392 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import tvm -from tvm import te -from tvm.script import tir as T - - -def intrin_vadd(xo, m, n): - x = te.placeholder((n,), name="vx") - y = te.placeholder((n,), name="vy") - if m % n == 0: - body = lambda i: x[i] + y[i] - else: - body = lambda i: tvm.tir.Select( - xo * n + i < m, x[i] + y[i], tvm.tir.const(0, dtype=x.dtype) - ) - z = te.compute(x.shape, body, name="z") - - def intrin_func(ins, outs): - xx, yy = ins - zz = outs[0] - # special handle needed to tackle tail loop part when m % n != 0 - # here is tvm.min(n, m - xo * n) - return tvm.tir.call_packed("vadd", xx, yy, zz) - - buffer_params = {"offset_factor": 16} - return te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params=buffer_params) - - -def intrin_gemv(m, n): - w = te.placeholder((m, n), name="w") - x = te.placeholder((n,), name="x") - k = te.reduce_axis((0, n), name="k") - z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z") - Wb = tvm.tir.decl_buffer( - w.shape, w.dtype, name="W", offset_factor=16, strides=[te.var("ldw"), 1] - ) - - def intrin_func(ins, outs): - ww, xx = ins - zz = outs[0] - ww_ptr = ww.access_ptr("r") - xx_ptr = xx.access_ptr("r") - zz_ptr = zz.access_ptr("w") - body = tvm.tir.call_packed("gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) - reset = tvm.tir.call_packed("fill_zero", zz_ptr, n) - update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) - return body, reset, update - - buffer_params = {"offset_factor": 16, "data_alignment": 16} - return te.decl_tensor_intrin( - z.op, intrin_func, binds={w: Wb}, default_buffer_params=buffer_params - ) - - -def intrin_gemv_no_reset(m, n): - w = te.placeholder((m, n), name="w") - x = te.placeholder((n,), name="x") - k = te.reduce_axis((0, n), name="k") - z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z") - Wb = tvm.tir.decl_buffer( - w.shape, w.dtype, name="W", offset_factor=16, strides=[te.var("ldw"), 1] - ) - - def intrin_func(ins, outs): - ww, xx = ins - zz = 
outs[0] - ww_ptr = ww.access_ptr("r") - xx_ptr = xx.access_ptr("r") - zz_ptr = zz.access_ptr("w") - body = tvm.tir.call_packed("gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) - update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0]) - return body, None, update - - buffer_params = {"offset_factor": 16, "data_alignment": 16} - return te.decl_tensor_intrin( - z.op, intrin_func, binds={w: Wb}, default_buffer_params=buffer_params - ) - - -def test_tensorize_vadd(): - def add(m): - x = te.placeholder((m,), name="x") - y = te.placeholder((m,), name="y") - z = te.compute(x.shape, lambda i: x[i] + y[i], name="z") - return x, y, z - - def check(m, factor): - x, y, z = add(m) - factor = T.int32(factor) - s = te.create_schedule(z.op) - xo, xi = s[z].split(z.op.axis[0], factor=factor) - vadd = intrin_vadd(xo, m, factor) - s[z].tensorize(xi, vadd) - s = s.normalize() - dom_map = tvm.te.schedule.InferBound(s) - finfer = tvm.get_global_func("test.op.InferTensorizeRegion") - out_dom, in_dom = finfer(s[z], dom_map) - tvm.ir.assert_structural_equal(out_dom[z.op.axis[0]].extent, factor) - tvm.ir.assert_structural_equal(out_dom[z.op.axis[0]].min, xo * factor) - tvm.ir.assert_structural_equal(in_dom.items()[0][1][0].extent, factor) - fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") - body = fmatch(s[z], out_dom, in_dom, vadd) - ana = tvm.arith.Analyzer() - tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(vadd.op.body[0])) - stmt = tvm.te.schedule.ScheduleOps(s, dom_map) - tvm.lower(s, [x, y, z]) - - def check_cache_write(m, factor): - x, y, z = add(m) - s = te.create_schedule(z.op) - _, _ = s[z].split(z.op.axis[0], factor=factor) - - z_global = s.cache_write(z, "global") - xo, xi = z_global.op.axis - - vadd = intrin_vadd(xo, m, factor) - s[z_global].tensorize(xi, vadd) - s = s.normalize() - dom_map = tvm.te.schedule.InferBound(s) - finfer = tvm.get_global_func("test.op.InferTensorizeRegion") - out_dom, in_dom = finfer(s[z_global], 
dom_map) - # outer loop var will be rebased, so min value is the new loop var and extent is 1 - tvm.ir.assert_structural_equal(out_dom[xo].extent, T.int32(1)) - assert isinstance(out_dom[xo].min, tvm.tir.Var) - assert xo.var.name == out_dom[xo].min.name - - fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") - body = fmatch(s[z_global], out_dom, in_dom, vadd)[0] - ana = tvm.arith.Analyzer() - vars = tvm.runtime.convert({xo.var: out_dom[xo].min}) - vadd_body = tvm.tir.stmt_functor.substitute(vadd.op.body[0], vars) - tvm.ir.assert_structural_equal(ana.simplify(body), ana.simplify(vadd_body)) - stmt = tvm.te.schedule.ScheduleOps(s, dom_map) - tvm.lower(s, [x, y, z]) - - def check_compute_reuse(): - x, y, z = add(32) - - def _intrin_vadd(): - def _intrin_func(ins, outs): - return tvm.tir.call_packed("vadd", ins[0], ins[1], outs[0]) - - return tvm.te.decl_tensor_intrin(z.op, _intrin_func) - - s = tvm.te.create_schedule(z.op) - s[z].tensorize(z.op.axis[0], _intrin_vadd()) - tvm.lower(s, [x, y, z]) - - check(128, 16) - check_cache_write(129, 16) - check_compute_reuse() - - -def test_tensorize_matmul(): - n = 1024 - m = n - l = n - A = te.placeholder((n, l), name="A") - B = te.placeholder((m, l), name="B") - k = te.reduce_axis((0, l), name="k") - C = te.compute((n, m), lambda i, j: te.sum(B[j, k] * A[i, k], axis=k), name="C") - - def check(factor): - s = te.create_schedule(C.op) - x, y = C.op.axis - yo, yi = s[C].split(y, factor=factor) - gemv = intrin_gemv(factor, l) - s[C].tensorize(yi, gemv) - s = s.normalize() - dom_map = tvm.te.schedule.InferBound(s) - finfer = tvm.get_global_func("test.op.InferTensorizeRegion") - out_dom, in_dom = finfer(s[C], dom_map) - tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1)) - tvm.ir.assert_structural_equal(out_dom[y].extent, factor) - tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor) - fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") - body = fmatch(s[C], out_dom, in_dom, gemv) - ana = 
tvm.arith.Analyzer() - - tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0])) - stmt = tvm.te.schedule.ScheduleOps(s, dom_map) - tvm.lower(s, [A, B, C]) - - def check_rfactor(factor, rfactor): - s = te.create_schedule(C.op) - x, y = C.op.axis - rk = C.op.reduce_axis[0] - yo, yi = s[C].split(y, factor=factor) - ro, ri = s[C].split(rk, factor=rfactor) - s[C].reorder(yo, ro, yi, ri) - gemv = intrin_gemv(factor, rfactor) - s[C].tensorize(yi, gemv) - s = s.normalize() - dom_map = tvm.te.schedule.InferBound(s) - finfer = tvm.get_global_func("test.op.InferTensorizeRegion") - out_dom, in_dom = finfer(s[C], dom_map) - tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1)) - tvm.ir.assert_structural_equal(out_dom[y].extent, factor) - tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor) - fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") - body = fmatch(s[C], out_dom, in_dom, gemv) - ana = tvm.arith.Analyzer() - tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0])) - stmt = tvm.te.schedule.ScheduleOps(s, dom_map) - tvm.lower(s, [A, B, C]) - - def check_rfactor_no_reset(factor, rfactor): - s = te.create_schedule(C.op) - x, y = C.op.axis - rk = C.op.reduce_axis[0] - yo, yi = s[C].split(y, factor=factor) - ro, ri = s[C].split(rk, factor=rfactor) - s[C].reorder(yo, ro, yi, ri) - gemv = intrin_gemv_no_reset(factor, rfactor) - s[C].tensorize(yi, gemv) - s = s.normalize() - dom_map = tvm.te.schedule.InferBound(s) - finfer = tvm.get_global_func("test.op.InferTensorizeRegion") - out_dom, in_dom = finfer(s[C], dom_map) - tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1)) - tvm.ir.assert_structural_equal(out_dom[y].extent, factor) - tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor) - fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") - body = fmatch(s[C], out_dom, in_dom, gemv) - ana = tvm.arith.Analyzer() - tvm.ir.assert_structural_equal(ana.simplify(body[0]), 
ana.simplify(gemv.op.body[0])) - stmt = tvm.te.schedule.ScheduleOps(s, dom_map) - tvm.lower(s, [A, B, C]) - - def check_rfactor_no_reset_multi_reduction(factor, rfactor): - s = te.create_schedule(C.op) - x, y = C.op.axis - rk = C.op.reduce_axis[0] - yo, yi = s[C].split(y, factor=factor) - ro, ri = s[C].split(rk, factor=rfactor) - roo, roi = s[C].split(ro, factor=2) - s[C].reorder(yo, roo, roi, yi, ri) - gemv = intrin_gemv_no_reset(factor, rfactor) - s[C].tensorize(yi, gemv) - s = s.normalize() - dom_map = tvm.te.schedule.InferBound(s) - finfer = tvm.get_global_func("test.op.InferTensorizeRegion") - out_dom, in_dom = finfer(s[C], dom_map) - tvm.ir.assert_structural_equal(out_dom[x].extent, T.int32(1)) - tvm.ir.assert_structural_equal(out_dom[y].extent, factor) - tvm.ir.assert_structural_equal(out_dom[y].min, yo * factor) - fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") - body = fmatch(s[C], out_dom, in_dom, gemv) - ana = tvm.arith.Analyzer() - tvm.ir.assert_structural_equal(ana.simplify(body[0]), ana.simplify(gemv.op.body[0])) - stmt = tvm.te.schedule.ScheduleOps(s, dom_map) - tvm.lower(s, [A, B, C]) - - check(T.int32(16)) - check_rfactor(T.int32(16), T.int32(16)) - check_rfactor_no_reset(T.int32(16), T.int32(16)) - check_rfactor_no_reset_multi_reduction(T.int32(16), T.int32(16)) - - -# This tests whether algorithm and intrinsics expressions are simplified -# as much as possible first and then checked for equality. 
See Issue #696 -def test_tensorize_op(): - idxd = tvm.tir.indexdiv - idxm = tvm.tir.indexmod - - def op_intrin(): - bh = 9 - bw = 9 - x = te.placeholder((5, 5), name="A") - y = te.compute((bh, bw), lambda i, j: x[idxd(j, 3) + idxm(i, 3), idxm(j, 3) + idxd(i, 3)]) - - def intrin_func(ins, outs): - (xx,) = ins - zz = outs[0] - return tvm.tir.call_packed("op", xx, zz) - - return te.decl_tensor_intrin(y.op, intrin_func, default_buffer_params={"offset_factor": 2}) - - A = te.placeholder((5, 5), name="A") - B = te.compute((9, 9), lambda i, j: A[idxd(j, 3) + idxm(i, 3), idxm(j, 3) + idxd(i, 3)]) - bt = op_intrin() - s = te.create_schedule(B.op) - - x, y = B.op.axis - s[B].tensorize(x, bt) - s = s.normalize() - tvm.lower(s, [A, B]) - - -# This test asserts that tensorize does not have any effect on -# TensorComputeOp operations -def test_tensorize_tensor_compute_op(): - # an intrinsic called "multivadd" whose definition (pattern) - # is a loop of another intrinsic called "vadd" - def intrin_multivadd(n): - n_a = te.var("n_a") - Ab = tvm.tir.decl_buffer((n,), "float32", strides=[n_a]) - - n_b = te.var("n_b") - Bb = tvm.tir.decl_buffer((n,), "float32", strides=[n_b]) - - n_c = te.var("n_c") - Cb = tvm.tir.decl_buffer((n,), "float32", strides=[n_c]) - - z = te.compute( - (n,), - lambda i: tvm.tir.call_extern( - "float32", - "vadd", - Ab.access_ptr("w", offset=n_a * i), - Bb.access_ptr("r", offset=n_b * i), - Cb.access_ptr("r", offset=n_c * i), - ), - ) - - # replace the pattern with the multivadd call. I need to figure out - # how to pass it the right parameters. 
- def intrin_func(ins, outs): - return tvm.tir.call_packed("multivadd") - - return te.decl_tensor_intrin(z.op, intrin_func, name="multivadd") - - def intrin_vadd(n): - dtype = "float32" - x = te.placeholder((n,), dtype=dtype, name="vx") - y = te.placeholder((n,), dtype=dtype, name="vy") - z = te.compute(x.shape, lambda i: x[i] + y[i], name="z") - s = te.create_schedule(z.op) - - def create_buffer(t): - return tvm.tir.decl_buffer(t.shape, t.dtype, name="W" + t.name, offset_factor=16) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - ib.emit( - tvm.tir.call_extern( - "float32", - "vadd", - ins[0].access_ptr("r"), - ins[1].access_ptr("r"), - outs[0].access_ptr("wr"), - ) - ) - return ib.get() - - return te.decl_tensor_intrin( - z.op, intrin_func, binds={x: create_buffer(x), y: create_buffer(y), z: create_buffer(z)} - ) - - # cache_read, cache_write - M = 1024 - factor = 16 - dtype = "float32" - - A = te.placeholder((M // factor, factor), name="A", dtype=dtype) - B = te.placeholder((M // factor, factor), name="B", dtype=dtype) - - vadd = intrin_vadd(factor) - C = te.compute((M // factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name="C") - - s = te.create_schedule(C.op) - multivadd = intrin_multivadd(64) - s[C].tensorize(C.op.axis[0], multivadd) - s = s.normalize() - dom_map = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, dom_map) - # The loop that we tried to tensorize still exists in the code - # That means tensorize didn't work as expected - assert isinstance(stmt.body, tvm.tir.For) - assert stmt.body.loop_var.name == C.op.axis[0].var.name - - -if __name__ == "__main__": - test_tensorize_vadd() - test_tensorize_matmul() - test_tensorize_op() - test_tensorize_tensor_compute_op() diff --git a/tests/python/te/test_te_tensor.py b/tests/python/te/test_te_tensor.py index 6958888e9bb6..31d6b1f4eb3a 100644 --- a/tests/python/te/test_te_tensor.py +++ b/tests/python/te/test_te_tensor.py @@ -128,91 +128,6 @@ def 
fidentity(t0, t1): T0, T1 = te.compute((m,), lambda i: mysum((idx[i, k], val[i, k]), axis=k, where=cond), name="T") -def test_tensor_compute1(): - m = 1024 - factor = 16 - dtype = "float32" - - def intrin_vadd(n): - x = te.placeholder((n,)) - y = te.placeholder((n,)) - z = te.compute(x.shape, lambda i: x[i] + y[i]) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - ib.emit( - tvm.tir.call_extern( - outs[0].dtype, - "vadd", - ins[0].access_ptr("r"), - ins[1].access_ptr("r"), - outs[0].access_ptr("wr"), - ) - ) - return ib.get() - - return te.decl_tensor_intrin(z.op, intrin_func, default_buffer_params={"offset_factor": n}) - - vadd = intrin_vadd(factor) - - A = te.placeholder((m // factor, factor), name="A", dtype=dtype) - B = te.placeholder((m // factor, factor), name="B", dtype=dtype) - C = te.compute((m // factor, factor), lambda i: vadd(A[i, 0:factor], B[i, 0:factor])) - - s = te.create_schedule(C.op) - # check lowering with the CSE pass disabled as otherwise it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - stmt = tvm.lower(s, [A, B, C])["main"].body - assert isinstance(stmt.body, tvm.tir.Evaluate) - - -def test_tensor_compute2(): - M = 2048 - N = 1024 - L = 1024 - factor = 16 - factor1 = 32 - factor2 = 32 - dtype = "float32" - - def intrin_gemm(m, n, l): - k = te.reduce_axis((0, l)) - x = te.placeholder((m, l)) - y = te.placeholder((n, l)) - # in theory, no relation - z = te.compute((m, n), lambda i, j: te.sum(x[i][k] * y[j][k], axis=k)) - - def intrin_func(ins, outs): - x_ptr = ins[0].access_ptr("r") - y_ptr = ins[1].access_ptr("r") - z_ptr = outs[0].access_ptr("w") - body = tvm.tir.call_packed("gemv", x_ptr, y_ptr, z_ptr, m, n, l) - reset = tvm.tir.call_packed("fill_zero", z_ptr, m, n) - update = tvm.tir.call_packed("gemv_add", x_ptr, y_ptr, z_ptr, m, n, l) - return body, reset, update - - return te.decl_tensor_intrin(z.op, intrin_func, 
default_buffer_params={"offset_factor": n}) - - vgemm = intrin_gemm(factor1, factor2, factor) - - A = te.placeholder((M // factor1, L // factor, factor1, factor), name="A", dtype=dtype) - B = te.placeholder((N // factor2, L // factor, factor2, factor), name="B", dtype=dtype) - k = te.reduce_axis((0, L // factor), name="k") - C = te.compute( - (M // factor1, N // factor2, factor1, factor2), - lambda i, j: vgemm( - A[i, k, 0:factor1, 0:factor], B[j, k, 0:factor2, 0:factor], reduce_axis=k - ), - ) - - s = te.create_schedule(C.op) - # check lowering with the CSE pass disabled as otherwise it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - stmt = tvm.lower(s, [A, B, C])["main"].body - assert isinstance(stmt.body.body[0], tvm.tir.Evaluate) - assert isinstance(stmt.body.body[1].body, tvm.tir.Evaluate) - - def test_tensor_scan(): m = te.size_var("m") n = te.size_var("n") @@ -251,7 +166,7 @@ def test_extern(): A = te.placeholder((m,), name="A") def extern_func(ins, outs): - assert isinstance(ins[0], tvm.te.schedule.Buffer) + assert isinstance(ins[0], tvm.tir.Buffer) return tvm.tir.call_packed("myadd", ins[0].data, outs[0].data, m) B = te.extern((m,), [A], extern_func) @@ -264,7 +179,7 @@ def test_extern_multi_out(): B = te.compute((m,), lambda i: A[i] * 10) def extern_func(ins, outs): - assert isinstance(ins[0], tvm.te.schedule.Buffer) + assert isinstance(ins[0], tvm.tir.Buffer) return tvm.tir.call_packed("myadd", ins[0].data, outs[0].data, outs[1].data, m) res = te.extern([A.shape, A.shape], [A, B], extern_func) @@ -278,13 +193,7 @@ def test_tuple_inputs(): A0 = te.placeholder((m, n), name="A0") A1 = te.placeholder((m, n), name="A1") T0, T1 = te.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name="T") - s = te.create_schedule(T0.op) - - for i in range(len(T0.shape)): - assert T0.shape[i] == T1.shape[i] - assert T0.op == T1.op - assert T0.value_index == 0 - assert T1.value_index == 1 + s = 
te.create_prim_func([A0, A1, T0]) def test_tuple_with_different_deps(): @@ -295,25 +204,7 @@ def test_tuple_with_different_deps(): B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] * 2, A1[i, j] * 3), name="B") C = te.compute((m, n), lambda i, j: B0[i, j] + 4, name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=10) - s[B0.op].compute_at(s[C], xo) - sch = s.normalize() - bounds = tvm.te.schedule.InferBound(sch) - stmt = tvm.te.schedule.ScheduleOps(sch, bounds) - - def get_B1_realize(x): - if ( - isinstance(x, tvm.tir.ProducerRealize) - and x.producer.op == B1.op - and x.producer.value_index == 1 - ): - ret.append(x) - - ret = [] - tvm.tir.stmt_functor.post_order_visit(stmt, get_B1_realize) - - assert stmt.producer == C and len(ret) == 1 + te.create_prim_func([A0, A1, C]) def test_tensor_inputs(): @@ -322,91 +213,6 @@ def test_tensor_inputs(): assert tuple(y.op.input_tensors) == (x,) -def test_tensor_pool(): - def intrin_pool(): - A = te.placeholder((64, 16, 16), name="A") - kh = te.reduce_axis((0, 3), name="kh") - kw = te.reduce_axis((0, 3), name="kw") - P = te.compute( - (64, 14, 14), - lambda c, oh, ow: tvm.te.max(A[c, oh + kh, ow + kw], axis=[kh, kw]), - name="p", - ) - - def intrin_func(ins, outs): - dinp = ins[0] - dout = outs[0] - return tvm.tir.call_packed("op", dinp, dout) - - return te.decl_tensor_intrin(P.op, intrin_func, default_buffer_params={"offset_factor": 1}) - - A = te.placeholder((1, 64, 16, 16), name="A") - P = pool2d( - data=A, kernel=(3, 3), stride=(1, 1), dilation=(1, 1), padding=(0, 0, 0, 0), pool_type="max" - ) - s = te.create_schedule(P.op) - _, oh, _, _ = P.op.axis - intrin = intrin_pool() - s[P].tensorize(oh, intrin) - tvm.lower(s, [A, P]) - - -def test_tensor_scalar_mixed(): - # test te with tensor and scalar - a = np.array(np.random.uniform(size=(10,)), "float32") - b = np.array(np.random.uniform(size=(1))[0], "float32") - c = np.array(np.random.uniform(size=(10,)), "float32") - - 
@tvm.register_func("tvm.test_tensor_scalar_scale") - def my_scale(tensor, scalar, out): - out_np = tensor.numpy() * scalar.numpy() - tvm.nd.array(out_np).copyto(out) - - A = te.placeholder(a.shape, name="A") - B = te.placeholder(b.shape, name="B") - C = te.extern( - a.shape, - [A, B], - lambda ins, outs: tvm.tir.call_packed( - "tvm.test_tensor_scalar_scale", ins[0], ins[1], outs[0] - ), - name="C", - ) - s = te.create_schedule(C.op) - f = tvm.build(s, [A, B, C], "llvm") - - ta = tvm.nd.array(a) - tb = tvm.nd.array(b) - tc = tvm.nd.array(c) - f(ta, tb, tc) - tvm.testing.assert_allclose(a * b, tc.numpy()) - - -def test_tensor_scalar(): - # test te with scalar shape - a = np.array(np.random.uniform(size=(1))[0], "float32") - b = np.array(0.0, "float32") - - @tvm.register_func("tvm.test_tensor_scalar_copy") - def mycopy(x, y): - x.copyto(y) - - A = te.placeholder(a.shape, name="A") - B = te.extern( - a.shape, - [A], - lambda ins, outs: tvm.tir.call_packed("tvm.test_tensor_scalar_copy", ins[0], outs[0]), - name="B", - ) - s = te.create_schedule(B.op) - f = tvm.build(s, [A, B], "llvm") - - ta = tvm.nd.array(a) - tb = tvm.nd.array(b) - f(ta, tb) - tvm.testing.assert_allclose(ta.numpy(), tb.numpy()) - - if __name__ == "__main__": test_tensor() test_rank_zero() @@ -426,6 +232,3 @@ def mycopy(x, y): test_tuple_inputs() test_tuple_with_different_deps() test_tensor_inputs() - test_tensor_pool() - test_tensor_scalar_mixed() - test_tensor_scalar() diff --git a/tests/python/te/test_te_transform_layout.py b/tests/python/te/test_te_transform_layout.py deleted file mode 100644 index 375fe4a24d57..000000000000 --- a/tests/python/te/test_te_transform_layout.py +++ /dev/null @@ -1,592 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import functools -import sys -import pytest - -import numpy as np - -import tvm -import tvm.testing -from tvm import te -from tvm.tir.stmt_functor import post_order_visit -from tvm.driver.build_module import schedule_to_module - -dtype = tvm.testing.parameter("int32") - - -def flatten_all_indices(preflatten_shape): - def mapping(*indices): - output = 0 - for index, size in zip(indices, preflatten_shape): - output = output * size + index - return [output] - - return mapping - - -def unpack_flattened_indices(preflatten_shape): - def mapping(i): - output = [] - for dim in reversed(preflatten_shape): - output.append(i % dim) - i //= dim - return output[::-1] - - return mapping - - -def traverse(s, op, callback): - visited = set() - - def _traverse(op): - if op in visited: - return - visited.add(op) - for tensor in op.input_tensors: - _traverse(tensor.op) - callback(op) - - _traverse(op) - - -class TestCompareAgainstExplicitReshape: - A_definition_style = tvm.testing.parameter( - "explicit_reshape", - "transform_layout", - ) - B_definition_style = tvm.testing.parameter( - "explicit_reshape", - "transform_layout", - ) - - reordered_shape = tvm.testing.parameter((2, 3, 4)) - - @tvm.testing.fixture - def n_items(self, reordered_shape): - return functools.reduce(lambda x, y: x * y, reordered_shape, 1) - - @tvm.testing.fixture - def fphysical_layout(self, reordered_shape): - return 
unpack_flattened_indices(reordered_shape) - - @tvm.testing.fixture - def fcompute(self, A_definition_style, B_definition_style, reordered_shape, n_items, dtype): - assert A_definition_style in ["explicit_reshape", "transform_layout"] - assert B_definition_style in ["explicit_reshape", "transform_layout"] - - def func(): - if A_definition_style == "explicit_reshape": - A_input = te.placeholder(shape=reordered_shape, name="A_input", dtype=dtype) - A = te.compute( - shape=(n_items,), - fcompute=lambda i: A_input[ - i // (reordered_shape[1] * reordered_shape[2]), - (i // reordered_shape[2]) % reordered_shape[1], - i % reordered_shape[2], - ], - name="A", - ) - - elif A_definition_style == "transform_layout": - A = te.placeholder(shape=(n_items,), name="A", dtype=dtype) - A_input = A - - B = te.compute(shape=A.shape, fcompute=lambda i: A[i], name="B") - - if B_definition_style == "explicit_reshape": - B_output = te.compute( - shape=reordered_shape, - fcompute=lambda i, j, k: B[ - i * reordered_shape[1] * reordered_shape[2] + j * reordered_shape[2] + k - ], - name="B_output", - ) - elif B_definition_style == "transform_layout": - B_output = B - - return A_input, B_output - - return func - - @tvm.testing.fixture - def fschedule(self, A_definition_style, B_definition_style, fphysical_layout): - def func(outs): - outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs - s = te.create_schedule([x.op for x in outs]) - - def callback(op): - if (op.name == "A" and A_definition_style == "transform_layout") or ( - op.name == "B" and B_definition_style == "transform_layout" - ): - s[op].transform_layout(fphysical_layout) - - traverse(s, outs[0].op, callback) - return s - - return func - - @tvm.testing.parametrize_targets("llvm") - def test_external_reshape( - self, target, dev, fcompute, fschedule, n_items, reordered_shape, dtype - ): - A, B = fcompute() - s = fschedule(B) - - func = tvm.build(s, [A, B], target=target, name="copy_reshape") - - a_np = 
np.arange(n_items).reshape(reordered_shape).astype(dtype) - b_np = np.arange(n_items).reshape(reordered_shape).astype(dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.empty(b_np.shape, dtype=dtype, device=dev) - - func(a, b) - - tvm.testing.assert_allclose(b.numpy(), b_np) - - @tvm.testing.parametrize_targets("llvm") - def test_internal_reshape(self, target, dev, n_items, reordered_shape, dtype, fphysical_layout): - # The reshaping of the buffer gets flattened away in - # StorageFlatten. Therefore, testing the behavior by running only - # ApplyLayoutTransforms. - logical_shape = (n_items,) - A = te.placeholder(logical_shape, name="A", dtype=dtype) - B = te.compute(shape=logical_shape, fcompute=lambda i: A[i], name="B") - C = te.compute(shape=logical_shape, fcompute=lambda i: B[i], name="C") - - s = te.create_schedule(C.op) - s[B].transform_layout(fphysical_layout) - - mod = schedule_to_module(s, [A, C]) - body = mod["main"].body - - def walk_buffer_interactions(stmt, callback): - buffer_classes = [ - tvm.tir.BufferLoad, - tvm.tir.BufferStore, - tvm.tir.BufferRealize, - ] - - def inner(node): - if (type(node) in buffer_classes) and node.buffer.name == "B": - callback(node) - - post_order_visit(stmt, inner) - - # All references to the buffer are the same object - def check_references(): - buffer_object = None - - def inner(node): - nonlocal buffer_object - if buffer_object is None: - buffer_object = node.buffer - else: - assert node.buffer.same_as(buffer_object) - - return inner - - # The buffer has the expected shape. - def check_shape(expected_shape): - def inner(node): - assert tuple(node.buffer.shape) == expected_shape - - return inner - - # Before the transform, the buffer should be in the logical shape. 
- walk_buffer_interactions(body, check_references()) - walk_buffer_interactions(body, check_shape(logical_shape)) - - mod = tvm.tir.transform.ApplyLayoutTransforms()(mod) - body = mod["main"].body - - # After the transform, the buffer should be in the physical shape. - walk_buffer_interactions(body, check_references()) - walk_buffer_interactions(body, check_shape(reordered_shape)) - - -class Test2DPhysicalLayout: - transform_A = tvm.testing.parameter( - "1d_A", - "2d_A", - "2d_rev_A", - "3d_A", - ) - transform_B = tvm.testing.parameter( - "1d_B", - "2d_B", - "2d_rev_B", - "3d_B", - ) - - @staticmethod - def extract_logical_indices(stmt): - output = {} - - # Since the for loops can be reordered by the layout - # transformation, identify the loop corresponding to each - # pre-transformation axis based on the iteration extent. - def callback(node): - if isinstance(node, tvm.tir.For): - output[node.loop_var] = node.extent.value - - post_order_visit(stmt, callback) - return sorted(output, key=output.get) - - def get_transform(self, name): - name = name[:-2] - if name == "1d": - return None - elif name == "2d": - return lambda i, j, k: [i, j, te.AXIS_SEPARATOR, k] - elif name == "2d_rev": - return lambda i, j, k: [k, j, te.AXIS_SEPARATOR, i] - elif name == "3d": - return lambda i, j, k: [i, te.AXIS_SEPARATOR, j, te.AXIS_SEPARATOR, k] - else: - raise ValueError(f"Unknown transformation: {name}") - - def transform_indices(self, name, logical_shape, logical_index_vars): - name = name[:-2] - - i, j, k = logical_index_vars - - if name == "1d": - return [i * (logical_shape[1] * logical_shape[2]) + j * logical_shape[2] + k] - elif name == "2d": - return [i * logical_shape[1] + j, k] - elif name == "2d_rev": - return [k * logical_shape[1] + j, i] - elif name == "3d": - return [i, j, k] - else: - raise ValueError(f"Unknown transformation: {name}") - - def test_2d_physical(self, dtype, transform_A, transform_B): - logical_shape = (2, 3, 4) - A = te.placeholder(shape=logical_shape, 
dtype=dtype, name="A") - B = te.compute(shape=A.shape, fcompute=lambda i, j, k: A[i, j, k], name="B") - - s = te.create_schedule(B.op) - - func = self.get_transform(transform_A) - if func: - s[A].transform_layout(func) - - func = self.get_transform(transform_B) - if func: - s[B].transform_layout(func) - - # If the two buffers are accessed with the same indices, CSE - # will replace them with a Let binding. Since this makes it - # harder to test what the transformed indices are, disabling - # the CSE pass for this test. - with tvm.transform.PassContext(disabled_pass=["tir.CommonSubexprElimTIR"]): - mod = tvm.lower(s, [A, B]) - - logical_index_vars = self.extract_logical_indices(mod["main"].body) - expected_indices_A = self.transform_indices(transform_A, logical_shape, logical_index_vars) - expected_indices_B = self.transform_indices(transform_B, logical_shape, logical_index_vars) - - def callback(node): - if type(node) in [tvm.tir.BufferLoad, tvm.tir.BufferStore]: - name = node.buffer.name - if name == "A": - expected_indices = expected_indices_A - elif name == "B": - expected_indices = expected_indices_B - else: - raise RuntimeError(f"Unexpected buffer: {name}") - - tvm.ir.assert_structural_equal(expected_indices, node.indices) - - post_order_visit(mod["main"].body, callback) - - -class TestTransformedSchedules: - logical_shape = tvm.testing.parameter((4, 6, 40)) - - transform_names = [ - None, - "reverse", - "flatten_all", - "factor_last_by_4", - ] - - transform_A = tvm.testing.parameter(by_dict={f"A_{t}": t for t in transform_names}) - transform_B = tvm.testing.parameter( - by_dict={f"B_{t}": t for t in transform_names if t is not None} - ) - - after_transform = tvm.testing.parameter(None) - - def make_transform(self, logical_shape, transform_name): - if transform_name is None: - return lambda *indices: indices - elif transform_name == "reverse": - return lambda *indices: indices[::-1] - elif transform_name == "flatten_all": - return 
flatten_all_indices(logical_shape) - elif transform_name == "factor_last_by_4": - return lambda *indices, n: [*indices, n // 4, n % 4] - else: - raise NotImplementedError(f"Unknown transformation {transform_name}") - - def make_transformed_shape(self, logical_shape, transform_name): - if transform_name is None: - return logical_shape - elif transform_name == "reverse": - return logical_shape[::-1] - elif transform_name == "flatten_all": - num_elements = functools.reduce(lambda x, y: x * y, logical_shape, 1) - return [num_elements] - elif transform_name == "factor_last_by_4": - *indices, n = logical_shape - return [*indices, n // 4, 4] - else: - raise NotImplementedError(f"Unknown transformation {transform_name}") - - @tvm.testing.fixture - def expected_loop_order(self, logical_shape, transform_B, after_transform): - shape = self.make_transformed_shape(logical_shape, transform_B) - - if after_transform == "reorder": - shape = shape[::-1] - - elif after_transform == "split": - shape = [ - *shape[:-1], - 2, - shape[-1] // 2, - ] - - elif after_transform == "fuse": - fused_size = shape[0] if transform_B == "flatten_all" else shape[0] * shape[1] - shape = [fused_size, *shape[2:]] - - return shape - - @tvm.testing.fixture - def schedule(self, logical_shape, dtype, transform_A, transform_B, after_transform): - A = te.placeholder(shape=logical_shape, dtype=dtype, name="A") - B = te.compute(shape=A.shape, fcompute=lambda i, j, k: A[i, j, k], name="B") - - s = te.create_schedule(B.op) - - if transform_A: - s[A].transform_layout(self.make_transform(logical_shape, transform_A)) - - iter_vars = s[B].transform_layout(self.make_transform(logical_shape, transform_B)) - iter_vars = list(iter_vars) - - if after_transform == "reorder": - s[B].reorder(*iter_vars[::-1]) - - elif after_transform == "split": - s[B].split(iter_vars[-1], nparts=2) - - elif after_transform == "fuse": - to_fuse = iter_vars[:2] - s[B].fuse(*iter_vars[:2]) - - return { - "schedule": s, - "tensors": [A, B], - 
"iter_vars": iter_vars, - } - - def compare_tir_loop_order(self, stmt, expected_loop_order): - def collect_loops(node): - output = [] - - def callback(node): - if isinstance(node, tvm.tir.For): - output.append(node) - - post_order_visit(node, callback) - return output[::-1] - - loops = collect_loops(stmt) - loop_order = [loop.extent for loop in loops] - - np.testing.assert_array_equal(loop_order, expected_loop_order) - - def test_tir_loop_order(self, schedule, expected_loop_order): - func = tvm.lower(schedule["schedule"], schedule["tensors"])["main"] - self.compare_tir_loop_order(func.body, expected_loop_order) - - def test_te_loop_order(self, schedule, expected_loop_order): - s = schedule["schedule"] - A, B = schedule["tensors"] - iter_vars = schedule["iter_vars"] - - # No reduction axis, so all leaf_iter_vars are over the data - # array, and should have the new iteration variables. - extents = [int(iter_var.dom.extent) for iter_var in s[B].leaf_iter_vars] - np.testing.assert_array_equal(extents, expected_loop_order) - - # layout_transform should return the new iteration variables. 
- extents = [int(iter_var.dom.extent) for iter_var in iter_vars] - np.testing.assert_array_equal(extents, expected_loop_order) - - @pytest.mark.parametrize("after_transform", ["reorder", "split", "fuse"]) - def test_use_transformed_axes( - self, schedule, expected_loop_order, transform_A, transform_B, after_transform - ): - s = schedule["schedule"] - A, B = schedule["tensors"] - - func = tvm.lower(s, [A, B])["main"] - self.compare_tir_loop_order(func.body, expected_loop_order) - - -class TestTransformCache: - A_size = tvm.testing.parameter(16) - - transform_A = tvm.testing.parameter(by_dict={"transformA": True, "": False}) - transform_B = tvm.testing.parameter(by_dict={"transformB": True, "": False}) - cache_A = tvm.testing.parameter(by_dict={"cacheA": True, "": False}) - cache_B = tvm.testing.parameter(by_dict={"cacheB": True, "": False}) - - @tvm.testing.fixture - def schedule_args(self, target, A_size, transform_A, transform_B, cache_A, cache_B, dtype): - A = te.placeholder(shape=[A_size], dtype=dtype, name="A") - B = te.compute(A.shape, lambda i: A[i], name="B") - s = te.create_schedule(B.op) - - requires_thread_bind = "gpu" in tvm.target.Target(target).keys - thread_x = te.thread_axis("threadIdx.x") - thread_y = te.thread_axis("threadIdx.y") - thread_z = te.thread_axis("threadIdx.z") - - if cache_A: - AA = s.cache_read(A, "shared", [B]) - if requires_thread_bind: - s[AA].bind(AA.op.axis[0], thread_x) - - if cache_B: - BB = s.cache_write(B, "shared") - if requires_thread_bind: - s[BB].bind(BB.op.axis[0], thread_y) - - if transform_A: - A_axis = s[A].transform_layout(lambda i: [i // 4, i % 4]) - - if transform_B: - B_axis = s[B].transform_layout(lambda i: [i // 4, i % 4]) - else: - B_axis = B.op.axis - - if requires_thread_bind: - s[B].bind(B_axis[0], thread_z) - - return [s, [A, B]] - - @tvm.testing.fixture - def ref_data(self, A_size, dtype, transform_A, transform_B): - a_np = (100 * np.random.uniform(size=A_size)).astype(dtype) - b_np = a_np - - if 
transform_A: - a_np = a_np.reshape((-1, 4)) - - if transform_B: - b_np = b_np.reshape((-1, 4)) - - return a_np, b_np - - def test_lower(self, schedule_args): - tvm.lower(*schedule_args) - - def test_execute(self, target, dev, schedule_args, ref_data, dtype): - func = tvm.build(*schedule_args, target=target) - - a_np, b_np = ref_data - a = tvm.nd.array(a_np, dev) - b = tvm.nd.empty(b_np.shape, dtype=dtype, device=dev) - - func(a, b) - - if "int" in dtype: - np.testing.assert_equal(b.numpy(), b_np) - else: - tvm.testing.assert_allclose(b.numpy(), b_np) - - -def test_transform_with_reduction(): - # To trigger this failure mode, the computation must use a - # reduction axis, - A = te.placeholder([16, 32, 64], dtype="float32", name="A") - k = te.reduce_axis((0, A.shape[-1]), name="k") - B = te.compute(A.shape[:-1], lambda i, j: te.sum(A[i, j, k], axis=[k])) - s = te.create_schedule(B.op) - - # And the output of the computation must have a layout - # transformation applied. - s[B].transform_layout(lambda i, j: [j, i]) - - # When present, the failure occurred during tvm.lower, during the - # call to `tvm::te::PassDownBitMaskOr`. - tvm.lower(s, [A, B]) - - -shape, transform = tvm.testing.parameters( - ([1, 8], lambda n, i: [i, n]), - ([1, 1, 8], lambda i, j, k: [j, te.AXIS_SEPARATOR, i, k]), - ([1, 1, 8], lambda i, j, k: [i, te.AXIS_SEPARATOR, j, k]), -) - - -def test_size_one_buffer(shape, transform): - # This test is to catch a failure mode that occurred if a - # transformation were applied to a te.compute buffer, and one of - # the dimensions of the buffer was 1. Prior to bugfix, - # arith::DetectIterMap would fold the variable as a constant, - # causing an error when attempting to solve for the variable using - # arith::InverseAffineIterMap. 
- - dtype = "int8" - A = te.placeholder(shape, dtype, name="A") - B = te.compute( - shape=A.shape, - fcompute=lambda *indices: A[indices].astype(dtype), - name="B", - ) - s = te.create_schedule(B.op) - - # If layout transformation is on the output buffer, and any - # dimension of the output buffer is 1, failure occurs in - # CheckFusePattern. - s[B].transform_layout(transform) - - -def test_non_divisible_transform_raises_error(): - A = te.placeholder([1, 3, 8, 8]) - B = te.compute(A.shape, lambda *indices: A[indices]) - s = te.create_schedule(B.op) - - transform = lambda n, c, h, w: [n, c // 4, h, w, c % 4] - # Error occurs here, because the transformation would introduce - # padding. Padded transforms are supported in TIR-based - # schedules. - with pytest.raises(tvm.TVMError): - s[B].transform_layout(transform) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/tir-analysis/test_tir_analysis_usedef.py b/tests/python/tir-analysis/test_tir_analysis_usedef.py deleted file mode 100644 index 940355e1415c..000000000000 --- a/tests/python/tir-analysis/test_tir_analysis_usedef.py +++ /dev/null @@ -1,36 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import pytest -import tvm -from tvm import te - - -@pytest.mark.xfail -def test_loop_dependent_allocate(): - N = te.size_var("N") - A = te.placeholder((2 * N,), "float32", "A") - C = te.compute((N,), lambda i: A[2 * i] + A[i + 1], name="C") - s = te.create_schedule(C.op) - AA = s.cache_read(A, "local", [C]) - s[AA].compute_at(s[C], s[C].op.axis[0]) - # this line should fail due to IRUseDefAnalysis sees an allocate statement - # referencing undefined variable - tvm.lower(s, [A, C]) - - -if __name__ == "__main__": - test_loop_dependent_allocate() diff --git a/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py b/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py deleted file mode 100644 index 45a8a8138bd5..000000000000 --- a/tests/python/tir-analysis/test_tir_analysis_verify_gpu_code.py +++ /dev/null @@ -1,434 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-"""Test gpu code verifier""" -import tvm -from tvm import te -from tvm import topi -import tvm.testing - - -def get_verify_pass(valid, **kwargs): - def _fverify(f, *_): - valid[0] = tvm.tir.analysis.verify_gpu_code(f, kwargs) - return f - - return tvm.tir.transform.prim_func_pass(_fverify, opt_level=0) - - -@tvm.testing.requires_gpu -def test_shared_memory(): - def check_shared_memory(storage_scope, dtype): - N = 1024 - M = 128 - - tvm_type = tvm.runtime.DataType(dtype) - type_size = tvm_type.bits // 8 * tvm_type.lanes - - A = te.placeholder((N,), name="A", dtype=dtype) - B = te.compute((N,), lambda i: A[i], name="B") - - s = te.create_schedule([B.op]) - AA = s.cache_read(A, storage_scope, [B]) - o, i = s[B].split(s[B].op.axis[0], M) - s[AA].compute_at(s[B], o) - s[B].bind(o, te.thread_axis("blockIdx.x")) - s[B].bind(i, te.thread_axis("threadIdx.x")) - - # shared memory usage: M * sizeof(dtype) Bytes - # thread usage: M - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - valid = [None] - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, - max_shared_memory_per_block=type_size * M - 1, - max_threads_per_block=M, - ), - ) - ] - } - ): - tvm.build(s, [A, B], target) - assert not valid[0] - - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, - max_shared_memory_per_block=type_size * M, - max_threads_per_block=M, - ), - ) - ] - } - ): - tvm.build(s, [A, B], target) - assert valid[0] - - check_shared_memory("shared", "float32") - check_shared_memory("shared", "int8x4") - check_shared_memory("shared.dyn", "float32") - - -@tvm.testing.requires_gpu -def test_local_memory(): - N = 1024 - M = 128 - - A = te.placeholder((N,), name="A", dtype="float32") - B = te.compute((N,), lambda i: A[i], name="B") - - s = te.create_schedule([B.op]) - AA = s.cache_read(A, "local", [B]) - o, i = s[B].split(s[B].op.axis[0], M) - 
s[AA].compute_at(s[B], o) - s[B].bind(o, te.thread_axis("blockIdx.x")) - - # local memory usage: M * 4B - # thread usage: M - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - - valid = [None] - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, max_local_memory_per_block=4 * M - 1, max_threads_per_block=1 - ), - ) - ] - } - ): - tvm.build(s, [A, B], target) - assert not valid[0] - - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, max_local_memory_per_block=4 * M, max_threads_per_block=1 - ), - ) - ] - } - ): - tvm.build(s, [A, B], target) - assert valid[0] - - -@tvm.testing.requires_gpu -def test_num_thread(): - N = 1024 - M = 128 - - A = te.placeholder((N,), name="A", dtype="float32") - B = te.compute((N,), lambda i: A[i], name="B") - - s = te.create_schedule([B.op]) - o, i = s[B].split(s[B].op.axis[0], M) - - s[B].bind(o, te.thread_axis("threadIdx.x")) - s[B].bind(i, te.thread_axis("threadIdx.y")) - - # shared memory usage: 0 - # thread usage: N - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - - valid = [None] - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1 - ), - ) - ] - } - ): - tvm.build(s, [A, B], target) - assert not valid[0] - - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, max_shared_memory_per_block=0, max_threads_per_block=N - ), - ) - ] - } - ): - tvm.build(s, [A, B], target) - assert valid[0] - - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, - max_shared_memory_per_block=0, - max_threads_per_block=N, - max_thread_y=M - 1, - ), - ) - ] - } - ): - tvm.build(s, [A, B], target) - assert not valid[0] - - with 
tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, - max_shared_memory_per_block=0, - max_threads_per_block=N, - max_thread_y=M, - ), - ) - ] - } - ): - tvm.build(s, [A, B], target) - assert valid[0] - - -@tvm.testing.requires_gpu -def test_multiple_kernels(): - N = 1024 - - A = te.placeholder((N, N), name="A") - B = te.compute((N, N), lambda i, j: A[i, j]) - C = te.compute((N, N), lambda i, j: B[i, j]) - - s = te.create_schedule([C.op]) - - s[C].bind(s[C].op.axis[1], te.thread_axis("threadIdx.x")) - s[B].bind(s[B].op.axis[1], te.thread_axis("threadIdx.x")) - - # shared memory usage: 0 - # thread usage: N - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - - valid = [None] - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, max_shared_memory_per_block=0, max_threads_per_block=N - 1 - ), - ) - ] - } - ): - tvm.build(s, [A, C], target) - assert not valid[0] - - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [ - ( - 2, - get_verify_pass( - valid, max_shared_memory_per_block=0, max_threads_per_block=N - ), - ) - ] - } - ): - tvm.build(s, [A, C], target) - assert valid[0] - - -@tvm.testing.requires_gpu -def test_wrong_bind(): - N = 1024 - - A = te.placeholder((N, N - 1), name="A") - B = te.compute((N, N - 1), lambda i, j: A[i, j]) - - s = te.create_schedule([B.op]) - - # bind a thread axis to two loop axes with different lengths - s[B].bind(s[B].op.axis[0], te.thread_axis("threadIdx.x")) - s[B].bind(s[B].op.axis[1], te.thread_axis("threadIdx.x")) - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - - valid = [None] - with tvm.transform.PassContext( - config={ - "tir.add_lower_pass": [(2, get_verify_pass(valid, max_threads_per_block=N * N))] - } - ): - tvm.build(s, [A, B], target) - assert not valid[0] - - -@tvm.testing.requires_gpu -def test_vectorize(): - N = 
1024 - - A = te.placeholder((N, N), name="A") - B = te.compute((N, N), lambda i, j: A[i, j]) - - s = te.create_schedule([B.op]) - - i, j = s[B].op.axis - - s[B].bind(i, te.thread_axis("blockIdx.x")) - jo, ji = s[B].split(j, factor=64) - s[B].bind(jo, te.thread_axis("threadIdx.x")) - s[B].vectorize(ji) - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - - valid = [None] - with tvm.transform.PassContext( - config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]} - ): - tvm.lower(s, [A, B]) - assert not valid[0] - - -@tvm.testing.requires_gpu -def test_vectorize_half(): - N = 1024 - - A = te.placeholder((N, N), name="A", dtype="float16") - B = te.compute((N, N), lambda i, j: A[i, j]) - - s = te.create_schedule([B.op]) - - i, j = s[B].op.axis - - s[B].bind(i, te.thread_axis("blockIdx.x")) - jo, ji = s[B].split(j, factor=8) - s[B].bind(jo, te.thread_axis("threadIdx.x")) - s[B].vectorize(ji) - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - - valid = [None] - with tvm.transform.PassContext( - config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]} - ): - tvm.lower(s, [A, B]) - assert valid[0] - - -@tvm.testing.requires_gpu -def test_vectorize_strided(): - N = 1024 - - A = te.placeholder((N, N), name="A", dtype="float16") - B = te.compute((N, N), lambda i, j: A[j, i]) - - s = te.create_schedule([B.op]) - - i, j = s[B].op.axis - - s[B].bind(i, te.thread_axis("blockIdx.x")) - jo, ji = s[B].split(j, factor=8) - s[B].vectorize(ji) - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - - valid = [None] - with tvm.transform.PassContext( - config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]} - ): - tvm.lower(s, [A, B]) - assert not valid[0] - - -@tvm.testing.requires_gpu -def test_vthread(): - N = 1024 - - A = te.placeholder((N, 16), name="A") - B = te.compute((N, 16), 
lambda i, j: A[i, j]) - - s = te.create_schedule([B.op]) - - s[B].bind(s[B].op.axis[0], te.thread_axis("blockIdx.x")) - s[B].bind(s[B].op.axis[1], te.thread_axis("vthread")) - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - - valid = [None] - - for phase in [1, 2]: - with tvm.transform.PassContext( - config={"tir.add_lower_pass": [(phase, get_verify_pass(valid, max_vthread=16))]} - ): - tvm.build(s, [A, B], target) - assert valid[0] - - with tvm.transform.PassContext( - config={"tir.add_lower_pass": [(phase, get_verify_pass(valid, max_vthread=15))]} - ): - tvm.build(s, [A, B], target) - assert not valid[0] - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/tir-analysis/test_tir_analysis_verify_memory.py b/tests/python/tir-analysis/test_tir_analysis_verify_memory.py deleted file mode 100644 index 4c89ff1185f7..000000000000 --- a/tests/python/tir-analysis/test_tir_analysis_verify_memory.py +++ /dev/null @@ -1,121 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import tvm -import pytest -from tvm import te -import tvm.testing - -# The following DLDeviceType/TVMDeviceExtType values -# are originally defined in dlpack.h and c_runtime_api.h. 
-gpu_devices = ["cuda", "opencl", "metal", "vulkan"] -other_devices = ["llvm", "ext_dev"] - - -# All computations are bound. -# So VerifyMemory pass is expected to succeed. -# -@tvm.testing.uses_gpu -def test_verify_memory_all_bind(): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B") - - # B is bound to threads. - s = te.create_schedule(B.op) - bx, tx = s[B].split(B.op.axis[0], factor=64) - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) - - mod = tvm.lower(s, [A, B]) - - for dev_type in gpu_devices + other_devices: - if tvm.testing.device_enabled(dev_type): - binded_mod = tvm.tir.transform.Apply( - lambda f: f.with_attr("target", tvm.target.Target(dev_type)) - )(mod) - tvm.tir.transform.VerifyMemory()(binded_mod) - - -# Computations are not bound. -# So VerifyMemory pass fails when device type is GPU. -# -@tvm.testing.uses_gpu -def test_verify_memory_not_bind(): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B") - - # B is not bound to threads. - s = te.create_schedule(B.op) - - mod = tvm.lower(s, [A, B]) - - for dev_type in gpu_devices: - if tvm.testing.device_enabled(dev_type): - binded_mod = tvm.tir.transform.Apply( - lambda f: f.with_attr("target", tvm.target.Target(dev_type)) - )(mod) - with pytest.raises(RuntimeError): - tvm.tir.transform.VerifyMemory()(binded_mod) - - for dev_type in other_devices: - if tvm.testing.device_enabled(dev_type): - binded_mod = tvm.tir.transform.Apply( - lambda f: f.with_attr("target", tvm.target.Target(dev_type)) - )(mod) - tvm.tir.transform.VerifyMemory()(binded_mod) - - -# Computations are partially bound. -# So VerifyMemory pass fails when device type is GPU. 
-# -@tvm.testing.uses_gpu -def test_verify_memory_partially_bind(): - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda i: A[i] + 1.0, name="B") - C = te.compute(B.shape, lambda i: B[i] + 2.0, name="C") - D = te.compute(C.shape, lambda i: C[i] + 2.0, name="D") - - # C is bound to threads, but B and D are not. - s = te.create_schedule([B.op, C.op, D.op]) - bx, tx = s[C].split(C.op.axis[0], factor=64) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - - mod = tvm.lower(s, [A, B, C, D]) - - for dev_type in gpu_devices: - if tvm.testing.device_enabled(dev_type): - binded_mod = tvm.tir.transform.Apply( - lambda f: f.with_attr("target", tvm.target.Target(dev_type)) - )(mod) - with pytest.raises(RuntimeError): - tvm.tir.transform.VerifyMemory()(binded_mod) - - for dev_type in other_devices: - if tvm.testing.device_enabled(dev_type): - binded_mod = tvm.tir.transform.Apply( - lambda f: f.with_attr("target", tvm.target.Target(dev_type)) - )(mod) - tvm.tir.transform.VerifyMemory()(binded_mod) - - -if __name__ == "__main__": - test_verify_memory_all_bind() - test_verify_memory_not_bind() - test_verify_memory_partially_bind() diff --git a/tests/python/tir-base/test_lower_build.py b/tests/python/tir-base/test_lower_build.py index 0e610cc1659b..edb3ed351e5d 100644 --- a/tests/python/tir-base/test_lower_build.py +++ b/tests/python/tir-base/test_lower_build.py @@ -18,7 +18,6 @@ import numpy as np import tvm -from tvm import te from tvm.ir.module import IRModule from tvm.script import tir as T import tvm.testing @@ -94,22 +93,6 @@ def main( ) -def test_lower_build_te_schedule(): - m, n, k = 128, 128, 128 - axis_k = te.reduce_axis((0, k), "k") - A = te.placeholder((m, k), name="A") - B = te.placeholder((k, n), name="B") - C = te.compute((m, n), lambda x, y: te.sum(A[x, axis_k] * B[y, axis_k], axis=axis_k), name="C") - s = te.create_schedule(C.op) - # check lowering with the CSE pass disabled as otherwise 
it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - ir_mod = tvm.lower(s, [A, B, C]) - tvm.ir.assert_structural_equal(ir_mod, LoweredModule) - # check building - mod = tvm.build(s, [A, B, C], target="llvm") - _check_module_with_numpy(mod) - - def test_lower_build_tir_func(): # check lowering with the CSE pass disabled as otherwise it would do some commoning with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): diff --git a/tests/python/tir-base/test_tir_buffer.py b/tests/python/tir-base/test_tir_buffer.py index d706e65d8186..791de769955e 100644 --- a/tests/python/tir-base/test_tir_buffer.py +++ b/tests/python/tir-base/test_tir_buffer.py @@ -178,85 +178,6 @@ def assert_simplified_equal(index_simplified, index_direct): assert_simplified_equal(index_simplified2, index_direct) -@tvm.testing.requires_llvm -def test_buffer_broadcast(): - m0, m1, m2 = te.size_var("m0"), te.size_var("m1"), te.size_var("m2") - n0, n1, n2 = te.size_var("n0"), te.size_var("n1"), te.size_var("n2") - o0, o1, o2 = te.size_var("o0"), te.size_var("o1"), te.size_var("o2") - - A = te.placeholder((m0, m1, m2), name="A") - B = te.placeholder((n0, n1, n2), name="B") - - C = te.compute((o0, o1, o2), lambda i, j, k: A[i, j, k] + B[i, j, k], name="C") - - Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast") - Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast") - s = te.create_schedule(C.op) - - def check(): - fadd = tvm.build(s, [A, B, C], target="llvm", name="bcast_add", binds={A: Ab, B: Bb}) - dev = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(2, 4, 3)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(2, 1, 1)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((2, 4, 3), dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - check() - - -@tvm.testing.requires_llvm 
-def test_buffer_broadcast_expr(): - n0, m0, x = te.size_var("n0"), te.size_var("m0"), te.size_var("x") - n1, m1 = te.size_var("n1"), te.size_var("m1") - o0, o1 = te.size_var("o0"), te.size_var("o1") - - A = te.placeholder((m0, n0), name="A") - B = te.placeholder((m1, n1), name="B") - C = te.compute((o0, o1 // x), lambda i, j: A[i, j] + B[i, j], name="C") - - Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="Ab", buffer_type="auto_broadcast") - Bb = tvm.tir.decl_buffer(B.shape, B.dtype, name="Bb", buffer_type="auto_broadcast") - Cc = tvm.tir.decl_buffer(C.shape, C.dtype, name="Cc", buffer_type="auto_broadcast") - s = te.create_schedule(C.op) - - def check_stride(): - fadd = tvm.build( - s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc} - ) - dev = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev) - fadd(a, b, c, 4, 1) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - def check_no_stride(): - fadd = tvm.build( - s, [A, B, C, o1, x], target="llvm", name="bcast_add", binds={A: Ab, B: Bb, C: Cc} - ) - dev = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev) - fadd(a, b, c, 4, 1) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - def check_auto_bind(): - # Let build bind buffers - fadd = tvm.build(s, [A, B, C, o1, x], target="llvm", name="bcast_add") - dev = tvm.cpu(0) - a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), dev) - fadd(a, b, c, 4, 1) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - check_stride() - check_no_stride() 
- check_auto_bind() - - def test_buffer_flatten(): """A buffer should flatten to a 1-d shape""" buf = tvm.tir.decl_buffer([16, 32]) diff --git a/tests/python/tir-base/test_tir_intrin.py b/tests/python/tir-base/test_tir_intrin.py index 1ee709191c41..8ab18bc84855 100644 --- a/tests/python/tir-base/test_tir_intrin.py +++ b/tests/python/tir-base/test_tir_intrin.py @@ -31,13 +31,19 @@ def test_nearbyint(): ) A = te.placeholder((m,), name="A") A_rounded = te.compute((m,), lambda *i: tvm.tir.nearbyint(A(*i)), name="A") - s = te.create_schedule(A_rounded.op) - f = tvm.build(s, [A, A_rounded], "llvm") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, A_rounded]) + sch = tir.Schedule(mod) + + # Build from scheduled TIR + func = tvm.build(sch.mod, target="llvm") + dev = tvm.cpu(0) n = 10 a = tvm.nd.array(np.random.uniform(high=100, size=n).astype(A.dtype), dev) a_rounded = tvm.nd.array(np.random.uniform(size=n).astype(A_rounded.dtype), dev) - f(a, a_rounded) + func(a, a_rounded) # Note that numpys rint rounds to nearest integer with # ties to halfway is broken by rounding to even. # So that 1.5 and 2.5 will round 2. 
@@ -79,13 +85,19 @@ def run_test(tvm_intrin, np_func): ) A = te.placeholder((m,), name="A") B = te.compute((m,), lambda *i: tvm_intrin(A(*i)), name="B") - s = te.create_schedule(B.op) - f = tvm.build(s, [A, B], "llvm") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B]) + sch = tir.Schedule(mod) + + # Build from scheduled TIR + func = tvm.build(sch.mod, target="llvm") + dev = tvm.cpu(0) n = 10 a = tvm.nd.array(np.random.uniform(0.1, 0.5, size=n).astype(A.dtype), dev) b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - f(a, b) + func(a, b) tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-5, rtol=1e-5) for func in test_funcs: @@ -107,14 +119,20 @@ def run_test(tvm_intrin, np_func): A = te.placeholder((m,), name="A") B = te.placeholder((m,), name="B") C = te.compute((m,), lambda *i: tvm_intrin(A(*i), B(*i)), name="C") - s = te.create_schedule(C.op) - f = tvm.build(s, [A, B, C], "llvm") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B, C]) + sch = tir.Schedule(mod) + + # Build from scheduled TIR + func = tvm.build(sch.mod, target="llvm") + dev = tvm.cpu(0) n = 10 a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) b = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(B.dtype), dev) c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - f(a, b, c) + func(a, b, c) tvm.testing.assert_allclose(c.numpy(), np_func(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5) for func in test_funcs: @@ -128,14 +146,20 @@ def test_ldexp(): A = te.placeholder((m,), name="A") B = te.placeholder((m,), name="B", dtype="int32") C = te.compute((m,), lambda *i: tvm.tir.ldexp(A(*i), B(*i)), name="C") - s = te.create_schedule(C.op) - f = tvm.build(s, [A, B, C], "llvm") + + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B, C]) + sch = tir.Schedule(mod) + + # Build from scheduled TIR + func = tvm.build(sch.mod, target="llvm") + dev = tvm.cpu(0) n = 10 a = 
tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev) b = tvm.nd.array(np.random.randint(0, 5, size=n).astype(B.dtype), dev) c = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - f(a, b, c) + func(a, b, c) tvm.testing.assert_allclose(c.numpy(), np.ldexp(a.numpy(), b.numpy()), atol=1e-5, rtol=1e-5) @@ -162,17 +186,23 @@ def clz_np(x, dtype): m = te.var("m") A = te.placeholder((m,), name="A", dtype=dtype) B = te.compute((m,), lambda *i: tvm.tir.clz(A(*i)), name="B") - s = te.create_schedule(B.op) + # Convert to TIR and create schedule + mod = te.create_prim_func([A, B]) + sch = tir.Schedule(mod) + + # Apply scheduling primitives if target is Vulkan if target.kind.name == "vulkan": - bx, tx = s[B].split(B.op.axis[0], factor=64) + block = sch.get_block("B") + loop = sch.get_loops(block)[0] + bx, tx = sch.split(loop, factors=[None, 64]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") - s[B].bind(bx, te.thread_axis("blockIdx.x")) - s[B].bind(tx, te.thread_axis("threadIdx.x")) + # Build from scheduled TIR + func = tvm.build(sch.mod, target=target) - f = tvm.build(s, [A, B], target) n = 10 - highs = [10, 100, 1000, 10000, 100000, 1000000] if dtype == "int64": @@ -182,7 +212,7 @@ def clz_np(x, dtype): a_np = np.random.randint(1, high=high, size=(n,), dtype=dtype) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(np.zeros((n,)).astype("int32"), dev) - f(a, b) + func(a, b) ref = clz_np(a_np, dtype) np.testing.assert_equal(b.numpy(), ref) diff --git a/tests/python/tir-base/test_tir_ir_builder.py b/tests/python/tir-base/test_tir_ir_builder.py deleted file mode 100644 index 8a39337575a7..000000000000 --- a/tests/python/tir-base/test_tir_ir_builder.py +++ /dev/null @@ -1,565 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import tvm -from tvm import te -import numpy as np -import tvm.testing -from tvm.topi.math import cast - - -def test_for(): - ib = tvm.tir.ir_builder.create() - n = te.size_var("n") - A = ib.allocate("float32", n, name="A", scope="global") - with ib.for_range(0, n, name="i") as i: - A[i] = A[i] + 1 - with ib.for_range(0, 10, name="j") as j: - A[j] = A[j] + 2 - - body = ib.get() - assert isinstance(body, tvm.tir.Allocate) - body = body.body - assert isinstance(body, tvm.tir.For) - body = body.body - assert isinstance(body, tvm.tir.SeqStmt) - assert isinstance(body[1], tvm.tir.For) - - -def test_if(): - ib = tvm.tir.ir_builder.create() - n = te.size_var("n") - A = ib.pointer("float32", name="A") - tmod = tvm.tir.truncmod - with ib.for_range(0, n, name="i") as i: - with ib.if_scope(tmod(i, 2) == 0): - A[i] = A[i] + 1 - with ib.else_scope(): - A[0] = A[i] + 2 - - body = ib.get() - assert A == A - assert isinstance(body, tvm.tir.For) - body = body.body - assert isinstance(body, tvm.tir.IfThenElse) - assert isinstance(body.condition, tvm.tir.EQ) - assert isinstance(body.then_case.indices[0], tvm.tir.Var) - assert list(body.else_case.indices) == [0] - - -def test_prefetch(): - A = tvm.tir.decl_buffer((10, 20), name="A") - ib = tvm.tir.ir_builder.create() - n = te.size_var("n") - - with ib.for_range(0, n, name="i") as i: - ib.emit( - tvm.tir.Prefetch( - A, [tvm.ir.Range.from_min_extent(i + 1, 2), 
tvm.ir.Range.from_min_extent(0, 20)] - ) - ) - body = ib.get() - assert body.body.bounds[0].extent.value == 2 - - -def test_cpu(): - n = 1024 - dtype = "float32" - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - def test_device_ir(A, B, C): - n = A.shape[0] - max_threads = 8 - ib = tvm.tir.ir_builder.create() - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - Cptr = ib.buffer_ptr(C) - with ib.for_range(0, n, name="i") as i: - Cptr[i] = Aptr[i] + Bptr[i] - body = ib.get() - return body - - C = te.extern( - A.shape, - [A, B], - lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]), - name="vector_add", - dtype=dtype, - ) - s = te.create_schedule(C.op) - - def check_target(target): - if not tvm.testing.device_enabled(target): - return - # build and invoke the kernel. - fadd = tvm.build(s, [A, B, C], target) - dev = tvm.device(target, 0) - # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - check_target("llvm") - - -@tvm.testing.requires_gpu -def test_gpu(): - n = te.size_var("n") - dtype = "float32" - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - idxd = tvm.tir.indexdiv - - def test_device_ir(A, B, C): - n = A.shape[0] - max_threads = 32 - ib = tvm.tir.ir_builder.create() - bx = te.thread_axis("blockIdx.x") - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(bx, "thread_extent", idxd(n + max_threads - 1, max_threads)) - ib.scope_attr(tx, "thread_extent", max_threads) - idx = bx.var * max_threads + tx.var - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - Cptr = ib.buffer_ptr(C) - with ib.if_scope(ib.likely(idx < n)): - Cptr[idx] = Aptr[idx] + Bptr[idx] - body = ib.get() - return body - - C = te.extern( - A.shape, - [A, B], - lambda ins, outs: test_device_ir(ins[0], 
ins[1], outs[0]), - name="vector_add", - dtype=dtype, - ) - s = te.create_schedule(C.op) - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - def check_target(target): - n = 1024 - if not tvm.testing.device_enabled(target): - return - # build and invoke the kernel. - fadd = tvm.build(s, [A, B, C], target) - dev = tvm.device(target, 0) - # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - check_target("opencl") - check_target("cuda") - - -def test_while_vectorize(): - """Test while loop + vectorized inner loop""" - - n = 64 - num_iter = 10 - - def test_ir(A, B, C): - ib = tvm.tir.ir_builder.create() - n = C.shape[0] - A = ib.buffer_ptr(A) - B = ib.buffer_ptr(B) - C = ib.buffer_ptr(C) - i = ib.allocate("int32", (1,), name="i", scope="local") - i[0] = 0 - - with ib.for_range(0, n) as j: - C[j] = 0.0 - - with ib.while_loop(i[0] < num_iter): - with ib.for_range(0, n, kind="vectorize") as j: - C[j] += A[j] + B[j] - i[0] += 1 - - return ib.get() - - def check_target(target, ir): - dtype = "float32" - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.placeholder((n,), name="B", dtype=dtype) - - C = te.extern( - (n,), - [A, B], - lambda ins, outs: ir(ins[0], ins[1], outs[0]), - name="while_vectorize", - dtype=dtype, - ) - s = te.create_schedule(C.op) - - with tvm.transform.PassContext(opt_level=3): - func = tvm.build(s, [A, B, C], target) - - dev = tvm.device(target, 0) - a_np = np.random.uniform(size=n).astype(A.dtype) - b_np = np.random.uniform(size=n).astype(B.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - func(a, b, c) - ref = num_iter * (a_np + b_np) - tvm.testing.assert_allclose(c.numpy(), ref, rtol=1e-5, 
atol=1e-5) - - check_target("llvm", test_ir) - - -def test_while_collatz(): - """Test while loop + if""" - - def collatz_ref(n): - a = n - i = 0 - while a > 1: - if a % 2 == 1: - a = 3 * a + 1 - else: - a = a >> 1 - i += 1 - return i - - def collatz(ib, n, C): - i = ib.allocate("int32", (1,), name="i", scope="local") - a = ib.allocate("int32", (1,), name="a", scope="local") - i[0] = 0 - a[0] = n - with ib.while_loop(a[0] > 1): - with ib.if_scope(tvm.tir.floormod(a[0], 2) == 1): - a[0] = 3 * a[0] + 1 - with ib.else_scope(): - a[0] = a[0] >> 1 - i[0] += 1 - - C[n] = i[0] - - def collatz_ir_cpu(C): - ib = tvm.tir.ir_builder.create() - n = C.shape[0] - C = ib.buffer_ptr(C) - - with ib.for_range(0, n, name="i", kind="parallel") as i: - collatz(ib, i, C) - - body = ib.get() - - return body - - n = 30 - - def check_target(target, ir): - C = te.extern( - (n,), - [], - lambda ins, outs: ir(outs[0]), - name="collatz", - dtype="int32", - ) - s = te.create_schedule(C.op) - - with tvm.transform.PassContext(opt_level=3): - func = tvm.build(s, [C], target) - - dev = tvm.device(target, 0) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - func(c) - ref = np.array([collatz_ref(i) for i in range(n)]) - tvm.testing.assert_allclose(c.numpy(), ref) - - check_target("llvm", collatz_ir_cpu) - - -def test_while_mandel(): - n = 160 - shape = (n * 2, n) - t = 300 - - def mandel_ref(): - def complex_sqr(z): - return np.array([z[0] ** 2 - z[1] ** 2, z[1] * z[0] * 2]) - - pixels = np.zeros(shape) - - for i in range(pixels.shape[0]): - for j in range(pixels.shape[1]): - c = np.array([-0.8, np.cos(t) * 0.2]) - z = np.array([i / n - 1, j / n - 0.5]) * 2 - iterations = 0 - - while np.linalg.norm(z) < 20 and iterations < 50: - z = complex_sqr(z) + c - iterations += 1 - - pixels[i, j] = 1 - iterations * 0.02 - - return pixels - - def mandel(ib, i, j, pixels): - z = ib.allocate("float32", (2,), name="z", scope="local") - tmp = ib.allocate("float32", (1,), name="tmp", scope="local") - iterations = 
ib.allocate("int32", (1,), name="iterations", scope="local") - - z[0] = (i / float(n) - 1) * 2 - z[1] = (j / float(n) - 0.5) * 2 - iterations[0] = 0 - c = [-0.8, float(np.cos(t)) * 0.2] - - def norm(z): - return tvm.tir.sqrt(z[0] * z[0] + z[1] * z[1]) - - with ib.while_loop(tvm.tir.all(norm(z) < 20, iterations[0] < 50)): - tmp[0] = z[0] - z[0] = z[0] * z[0] - z[1] * z[1] + c[0] - z[1] = z[1] * tmp[0] * 2 + c[1] - iterations[0] += 1 - - pixels[i, j] = 1 - iterations[0] * 0.02 - - def mandel_ir_cpu(C): - ib = tvm.tir.ir_builder.create() - ny = C.shape[0] - nx = C.shape[1] - C = ib.buffer_ptr(C) - - with ib.for_range(0, ny, name="i", kind="parallel") as i: - with ib.for_range(0, nx, name="j") as j: - mandel(ib, i, j, C) - - body = ib.get() - - return body - - def mandel_ir_gpu(C): - ib = tvm.tir.ir_builder.create() - ny = C.shape[0] - nx = C.shape[1] - C = ib.buffer_ptr(C) - - bx = te.thread_axis("blockIdx.x") - tx = te.thread_axis("threadIdx.x") - by = te.thread_axis("blockIdx.y") - ty = te.thread_axis("threadIdx.y") - - max_threads = 16 - ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(nx + max_threads - 1, max_threads)) - ib.scope_attr(tx, "thread_extent", max_threads) - ib.scope_attr(by, "thread_extent", tvm.tir.indexdiv(ny + max_threads - 1, max_threads)) - ib.scope_attr(ty, "thread_extent", max_threads) - - tidx = bx * max_threads + tx - tidy = by * max_threads + ty - - with ib.if_scope(tvm.tir.all(tidx < nx, tidy < ny)): - mandel(ib, tidy, tidx, C) - - body = ib.get() - - return body - - ref = mandel_ref() - - def check_target(target, ir): - if not tvm.testing.device_enabled(target): - return - - C = te.extern( - shape, - [], - lambda ins, outs: ir(outs[0]), - name="mandel_ir", - dtype="float32", - ) - s = te.create_schedule(C.op) - - with tvm.transform.PassContext(opt_level=3): - func = tvm.build(s, [C], target) - - dev = tvm.device(target, 0) - c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), dev) - func(c) - tvm.testing.assert_allclose(c.numpy(), ref, 
rtol=1e-5, atol=1e-5) - - check_target("llvm", mandel_ir_cpu) - check_target("npvtx", mandel_ir_gpu) - check_target("cuda", mandel_ir_gpu) - check_target("vulkan", mandel_ir_gpu) - - -def test_while_binary_search(): - def binary_search(ib, n, i, Aptr, Bptr, Cptr): - lo = ib.allocate("int32", (1,), name="lo", scope="local") - hi = ib.allocate("int32", (1,), name="hi", scope="local") - - lo[0] = 0 - hi[0] = n - v = Bptr[i] - - with ib.while_loop(lo[0] < hi[0]): - mid = lo[0] + (hi[0] - lo[0] >> 1) - with ib.if_scope(Aptr[mid] < v): - lo[0] = mid + 1 - with ib.else_scope(): - hi[0] = mid - - Cptr[i] = lo[0] - - def searchsorted_ir_cpu(A, B, C, n): - ib = tvm.tir.ir_builder.create() - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - Cptr = ib.buffer_ptr(C) - - with ib.for_range(0, n, name="i", kind="parallel") as i: - binary_search(ib, n, i, Aptr, Bptr, Cptr) - - body = ib.get() - - return body - - def searchsorted_ir_gpu(A, B, C, n): - ib = tvm.tir.ir_builder.create() - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - Cptr = ib.buffer_ptr(C) - - bx = te.thread_axis("blockIdx.x") - tx = te.thread_axis("threadIdx.x") - max_threads = 32 - ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads)) - ib.scope_attr(tx, "thread_extent", max_threads) - tid = bx * max_threads + tx - - with ib.if_scope(tid < n): - binary_search(ib, n, tid, Aptr, Bptr, Cptr) - - body = ib.get() - - return body - - n = 1024 - dtype = "float32" - A = te.placeholder((n,), name="A", dtype=dtype) - B = te.placeholder((n,), name="B", dtype=dtype) - - def check_target(target, ir): - if not tvm.testing.device_enabled(target): - return - - C = te.extern( - A.shape, - [A, B], - lambda ins, outs: ir(ins[0], ins[1], outs[0], n), - name="searchsorted_ir", - dtype="int32", - ) - s = te.create_schedule(C.op) - - with tvm.transform.PassContext(opt_level=3): - func = tvm.build(s, [A, B, C], target) - - dev = tvm.device(target, 0) - a_np = 
np.random.uniform(size=n).astype(A.dtype) - b_np = np.random.uniform(size=n).astype(B.dtype) - a_np = np.sort(a_np) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - func(a, b, c) - ref = np.searchsorted(a_np, b_np) - tvm.testing.assert_allclose(c.numpy(), ref) - - check_target("llvm", searchsorted_ir_cpu) - check_target("cuda", searchsorted_ir_gpu) - check_target("nvptx", searchsorted_ir_gpu) - check_target("vulkan", searchsorted_ir_gpu) - - -@tvm.testing.requires_gpu -def test_dyn_shared(): - n = te.size_var("n") - dtype = "float32" - A = te.placeholder((n,), name="A") - - def test_device_ir(A, B): - n = A.shape[0] - ib = tvm.tir.ir_builder.create() - - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", n) - - temp = ib.allocate(dtype, (n,), scope="shared.dyn") # n is symbolic size - - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - - temp[tx] = Aptr[tx] - depth = tvm.tir.log2(cast(n, "float32")) - - with ib.for_range(0, cast(tvm.tir.ceil(depth), n.dtype)) as i: - ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) - d = n >> (i + 1) - with ib.if_scope(tx < d): - temp[tx] += temp[tx + d] - - Bptr[0] = temp[0] - return ib.get() - - B = te.extern( - (1,), - [A], - lambda ins, outs: test_device_ir(ins[0], outs[0]), - name="reduce", - dtype=dtype, - ) - s = te.create_schedule(B.op) - - def check_target(target): - if not tvm.testing.device_enabled(target): - return - - freduce = tvm.build(s, [A, B], target) - dev = tvm.device(target, 0) - - for n in [512, 1024]: - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(1, dtype=B.dtype), dev) - freduce(a, b) - tvm.testing.assert_allclose(b.numpy()[0], np.sum(a.numpy()), 1e-4, 1e-4) - - for target in ["cuda", "nvptx"]: - check_target(target) - - -if __name__ == "__main__": - test_prefetch() - test_if() - test_for() - test_cpu() - test_gpu() - 
test_while_vectorize() - test_while_collatz() - test_while_mandel() - test_while_binary_search() - test_dyn_shared() diff --git a/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py b/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py index cb7151f875e3..006ebf6a1a0d 100644 --- a/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py +++ b/tests/python/tir-transform/test_tir_transform_compact_buffer_region.py @@ -569,7 +569,6 @@ def expected(a: T.handle) -> None: class TestAnnotatedOpaqueAccess(BaseCompactTest): - is_lower_order_free = False @T.prim_func @@ -1154,7 +1153,6 @@ def expected( class TestNonStrictCompactionForPaddedMatmul(BaseCompactTest): - is_strict_mode = False @T.prim_func @@ -1231,7 +1229,6 @@ def expected( class TestNotCompactAliasBuffer(BaseCompactTest): - # it is not testcase on block form is_lower_order_free = False @@ -1251,7 +1248,6 @@ def before(): class TestNotCompactBufferWithDifferentDtype(BaseCompactTest): - # it is not testcase on block form is_lower_order_free = False @@ -1268,7 +1264,6 @@ def before(): class TestNonBoolCondition(BaseCompactTest): - # it is not testcase on block form is_lower_order_free = False @@ -1289,15 +1284,6 @@ def expected(): A[i - 1] = A[i - 1] + 1 -def test_lower_te(): - x = te.placeholder((1,)) - y = te.compute((1,), lambda i: x[i] + 2) - s = te.create_schedule(y.op) - orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y]) - mod = tvm.tir.transform.CompactBufferAllocation()(orig_mod) - tvm.ir.assert_structural_equal(mod, orig_mod) # CompactBufferAllocation should do nothing on TE - - class TestCompactSymbolicBound0: """Test symbolic bound that get compacted to constant""" diff --git a/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py b/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py index f920a46ba57e..63a57eeffe29 100644 --- 
a/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py +++ b/tests/python/tir-transform/test_tir_transform_convert_blocks_to_opaque.py @@ -74,15 +74,6 @@ def test_elementwise(): _check(elementwise_func, substituted_elementwise_func) -def test_lower_te(): - x = te.placeholder((1,)) - y = te.compute((1,), lambda i: x[i] + 2) - s = te.create_schedule(y.op) - orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y]) - mod = tvm.tir.transform.ConvertBlocksToOpaque()(orig_mod) - tvm.ir.assert_structural_equal(mod, orig_mod) # ConvertBlocksToOpaque should do nothing on TE - - class TestErrorIfPredicateUsesBlockVariables(tvm.testing.CompareBeforeAfter): transform = tvm.tir.transform.ConvertBlocksToOpaque() check_well_formed = False diff --git a/tests/python/tir-transform/test_tir_transform_flatten_buffer.py b/tests/python/tir-transform/test_tir_transform_flatten_buffer.py index 20f91b639497..b215398622cc 100644 --- a/tests/python/tir-transform/test_tir_transform_flatten_buffer.py +++ b/tests/python/tir-transform/test_tir_transform_flatten_buffer.py @@ -259,19 +259,6 @@ def expected(input_A: T.Buffer(10, "bool"), input_B: T.Buffer(10, "bool")) -> No B[i0] = T.cast(T.cast(A[i0], "bool"), "int8") -class TestLowerTE(BaseCompare): - """FlattenBuffer should do nothing on TE-based functions""" - - def before(self): - x = te.placeholder((1,)) - y = te.compute((1,), lambda i: x[i] + 2) - s = te.create_schedule(y.op) - mod = tvm.driver.build_module.schedule_to_module(s, [x, y]) - return mod["main"] - - expected = before - - class TestFlattenInsideBlock(BaseCompare): """Flattening access inside a block flattens the accessed region.""" diff --git a/tests/python/tir-transform/test_tir_transform_hoist_if.py b/tests/python/tir-transform/test_tir_transform_hoist_if.py index 04f3f9771c64..6695913a3c2c 100644 --- a/tests/python/tir-transform/test_tir_transform_hoist_if.py +++ b/tests/python/tir-transform/test_tir_transform_hoist_if.py @@ -515,34 +515,6 @@ def 
test_no_hoisting_7(): tvm.ir.assert_structural_equal(new_stmt, stmt) -def test_hoisting_block_scope_1(): - n = te.size_var("n") - m = te.size_var("m") - A = te.placeholder((n, m), name="A") - k = te.reduce_axis((0, m), "k") - B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B") - s = te.create_schedule(B.op) - ko, ki = s[B].split(B.op.reduce_axis[0], factor=16) - BF = s.rfactor(B, ki) - xo, xi = s[B].split(s[B].op.axis[0], factor=32) - s[B.op].bind(xo, te.thread_axis("blockIdx.x")) - s[B.op].bind(xi, te.thread_axis("threadIdx.y")) - s[B].bind(s[B].op.reduce_axis[0], te.thread_axis("threadIdx.x")) - s[BF].compute_at(s[B], s[B].op.reduce_axis[0]) - mod = tvm.driver.build_module.schedule_to_module(s, [A, B], "main", None) - mod = tvm.tir.transform.Simplify()(mod) - mod = tvm.tir.transform.RemoveNoOp()(mod) - stmt = mod["main"].body - new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body - tvm.ir.assert_structural_equal(new_stmt, stmt) - - with tvm.transform.PassContext( - config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}} - ): - new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body - assert not tvm.ir.structural_equal(new_stmt, stmt) - - def test_hoisting_block_scope_2(): ib = tvm.tir.ir_builder.create() dshape = (32, 64) @@ -617,37 +589,6 @@ def test_hoisting_block_scope_3(): assert not tvm.ir.structural_equal(new_stmt, stmt) -def test_hoisting_block_scope_4(): - nn = 1024 - n = tvm.runtime.convert(nn) - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - AA = te.compute((n,), lambda *i: A(*i), name="A") - BB = te.compute((n,), lambda *i: B(*i), name="B") - T = te.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T") - C = te.compute(A.shape, lambda *i: T(*i), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - xo1, xo2 = s[C].split(xo, factor=13) - s[C].parallel(xo2) - s[C].pragma(xo1, "parallel_launch_point") - s[C].pragma(xo2, 
"parallel_stride_pattern") - s[C].pragma(xo2, "parallel_barrier_when_finish") - s[C].vectorize(xi) - mod = tvm.driver.build_module.schedule_to_module(s, [A, B, C], "main", None) - mod = tvm.tir.transform.Simplify()(mod) - - stmt = mod["main"].body - new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body - tvm.ir.assert_structural_equal(new_stmt, stmt) - - with tvm.transform.PassContext( - config={"tir.HoistIfThenElse": {"support_block_scope_hosting": True}} - ): - new_stmt = tvm.tir.transform.HoistIfThenElse()(mod)["main"].body - assert not tvm.ir.structural_equal(new_stmt, stmt) - - def test_hoisting_block_scope_5(): ib = tvm.tir.ir_builder.create() data = ib.pointer("float32", name="data", scope="global") diff --git a/tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py b/tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py deleted file mode 100644 index aa0448c3c682..000000000000 --- a/tests/python/tir-transform/test_tir_transform_inject_copy_intrin.py +++ /dev/null @@ -1,124 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import tvm -import tvm.testing -from tvm import te -from tvm.driver.build_module import schedule_to_module - - -def test_copy2d(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - B = te.compute((m, l), lambda i, j: A[i, j], name="B") - s = te.create_schedule(B.op) - s[B].pragma(B.op.axis[0], "memcpy") - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) - mod = tvm.IRModule.from_expr(func) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - - def cb(src, dst, pad_before, pad_after, pad_value): - assert dst.strides[0] == l - assert dst.strides[1].value == 1 - assert src.strides[0] == l - assert tuple(src.shape) == (m, l) - return tvm.tir.Evaluate(0) - - stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body - - -def test_copy_pad(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - B = te.compute( - (m + 2, l), - lambda i, j: tvm.tir.if_then_else(tvm.tir.all(i >= 1, i < m + 1), A[i - 1, j], 1.0), - name="B", - ) - s = te.create_schedule(B.op) - s[B].pragma(B.op.axis[0], "memcpy") - mod = schedule_to_module(s, [A, B]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - - def cb(src, dst, pad_before, pad_after, pad_value): - tvm.testing.assert_prim_expr_equal(src.elem_offset, 0) - assert pad_before[0].value == 1 - assert pad_before[1].value == 0 - assert pad_after[0].value == 1 - assert pad_after[1].value == 0 - assert pad_value.value == 1.0 - return tvm.tir.Evaluate(0) - - stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body - - -def test_single_point_test(): - A = te.placeholder((1,), name="A") - B = te.compute((1,), lambda i: A[i], name="B") - s = te.create_schedule(B.op) - s[B].pragma(B.op.axis[0], "memcpy") - mod = schedule_to_module(s, [A, B]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - - def cb(src, dst, pad_before, pad_after, pad_value): - 
tvm.testing.assert_prim_expr_equal(src.elem_offset, 0) - tvm.testing.assert_prim_expr_equal(dst.elem_offset, 0) - tvm.testing.assert_prim_expr_equal(src.strides[0], 1) - tvm.testing.assert_prim_expr_equal(dst.strides[0], 1) - return tvm.tir.Evaluate(0) - - stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body - - -def test_copy_pad_split(): - m = 4 * 3 - A = te.placeholder((m,), name="A") - Apad = te.compute( - (m + 2,), lambda i: tvm.tir.if_then_else(tvm.tir.all(i >= 1, i <= m), A[i - 1], 0.0), "Apad" - ) - B = te.compute((m,), lambda i: Apad[i] + Apad[i + 1] + Apad[i + 2]) - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=4) - s[Apad].compute_at(s[B], xo) - s[Apad].pragma(s[Apad].op.axis[0], "memcpy") - - mod = schedule_to_module(s, [A, B]) - mod = tvm.tir.transform.StorageFlatten(64)(mod._move()) - mod = tvm.tir.transform.Simplify()(mod._move()) - - def cb(src, dst, pad_before, pad_after, pad_value): - assert dst.elem_offset.value == 0 - tvm.testing.assert_prim_expr_equal(src.elem_offset, tvm.te.max(xo * 4, 1) - 1) - - rpad_before = tvm.te.max(1 - xo * 4, 0) - rpad_after = tvm.te.max(xo * 4 - 7, 0) - tvm.testing.assert_prim_expr_equal(pad_before[0], rpad_before) - tvm.testing.assert_prim_expr_equal(pad_after[0], rpad_after) - tvm.testing.assert_prim_expr_equal(src.shape[0], 6 - rpad_before - rpad_after) - return tvm.tir.Evaluate(0) - - stmt = tvm.tir.transform.InjectCopyIntrin("memcpy", cb)(mod)["main"].body - - -if __name__ == "__main__": - test_copy2d() - test_copy_pad() - test_copy_pad_split() - test_single_point_test() diff --git a/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py b/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py index c1c8141f70a7..3d8f85bf79dd 100644 --- a/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py +++ b/tests/python/tir-transform/test_tir_transform_inject_rolling_buffer.py @@ -19,186 +19,9 @@ import tvm import tvm.script 
from tvm import te, topi -from tvm.driver.build_module import get_binds from tvm.script import tir as T -def _tile_nd(s, tensor, tile): - outer_indices = [] - inner_indices = [] - for i, size in enumerate(tile): - outer, inner = s[tensor].split(tensor.op.axis[i], size) - outer_indices.append(outer) - inner_indices.append(inner) - - s[tensor].reorder(*outer_indices, *inner_indices) - return outer_indices, inner_indices - - -@tvm.tir.transform.prim_func_pass(opt_level=0) -def remove_rolling_buffer_attr(func, mod, ctx): - def unwrap(node): - if isinstance(node, tvm.tir.AttrStmt) and node.attr_key == "rolling_buffer_scope": - return node.body - else: - return node - - return func.with_body( - tvm.tir.stmt_functor.ir_transform( - func.body, None, postorder=unwrap, only_enable=["tir.AttrStmt"] - ) - ) - - -@tvm.tir.transform.prim_func_pass(opt_level=0) -def verify_no_rolling_buffer_attr(func, mod, ctx): - def verify(node): - if isinstance(node, tvm.tir.AttrStmt): - assert node.attr_key != "rolling_buffer_scope", "Failed to lower rolling buffers" - - tvm.tir.stmt_functor.post_order_visit(func.body, verify) - - return func - - -def _verify_schedule(sch, inputs, output): - user_pass_lists = [ - [(0, remove_rolling_buffer_attr), (0, verify_no_rolling_buffer_attr)], - [(0, tvm.tir.transform.InjectRollingBuffer()), (0, verify_no_rolling_buffer_attr)], - ] - built_funcs = [] - for user_pass_list in user_pass_lists: - with tvm.transform.PassContext(config={"tir.add_lower_pass": user_pass_list}): - built_funcs.append(tvm.build(sch, inputs + [output])) - - outputs = [] - ctx = tvm.cpu(0) - input_data = [] - for tensor in inputs: - shape = [i.value for i in tensor.shape] - input_data.append( - tvm.nd.array(np.random.randint(low=-100, high=100, size=shape).astype("int8"), ctx) - ) - shape = [i.value for i in output.shape] - out = tvm.nd.array(np.zeros(shape, dtype="int8"), ctx) - for func in built_funcs: - func(*input_data, out) - outputs.append(out.numpy()) - - 
np.testing.assert_equal(outputs[0], outputs[1]) - - -@pytest.mark.parametrize("tile_shape", [(1, 4, 8, 16), (1, 8, 7, 11), (1, 8, 3, 8), (1, 7, 5, 3)]) -def test_tile_shapes(tile_shape): - A = te.placeholder((1, 12, 14, 16), name="A", dtype="int8") - pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - pool_b = topi.nn.pool2d(pool_a, (3, 5), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - - sch = tvm.te.create_schedule([pool_b.op]) - oi, ii = _tile_nd(sch, pool_b, tile_shape) - sch[pool_a].compute_at(sch[pool_b], oi[-1]) - sch[pool_a].rolling_buffer() - - _verify_schedule(sch, [A], pool_b) - - -def test_implied_split(): - A = te.placeholder((1, 12, 12, 16), name="A", dtype="int8") - pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - pool_b = topi.nn.pool2d(pool_a, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - - sch = tvm.te.create_schedule([pool_b.op]) - n, h, w, c = pool_b.op.axis - oi, ii = sch[pool_b].split(w, 4) - sch[pool_a].compute_at(sch[pool_b], oi) - sch[pool_a].rolling_buffer() - - _verify_schedule(sch, [A], pool_b) - - -@pytest.mark.parametrize("kernel_shape", [(1, 1), (3, 3)]) -def test_upscale(kernel_shape): - output_shape = (1, 24, 24, 16) - input_shape = ( - output_shape[0], - output_shape[1] // 2 + 2 * (kernel_shape[0] - 1), - output_shape[2] // 2 + 2 * (kernel_shape[1] - 1), - output_shape[3], - ) - A = te.placeholder(input_shape, name="A", dtype="int8") - pool_a = topi.nn.pool2d(A, kernel_shape, (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - pool_b = topi.nn.pool2d( - pool_a, kernel_shape, (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC" - ) - upscale = te.compute((1, 24, 24, 16), lambda nn, hh, ww, cc: pool_b[nn, hh // 2, ww // 2, cc]) - - sch = tvm.te.create_schedule([upscale.op]) - oi, ii = _tile_nd(sch, upscale, (1, 5, 5, 16)) - sch[pool_b].compute_at(sch[upscale], oi[-1]) - sch[pool_b].rolling_buffer() - 
sch[pool_a].compute_at(sch[upscale], oi[-1]) - sch[pool_a].rolling_buffer() - - _verify_schedule(sch, [A], upscale) - - -@pytest.mark.parametrize("tile_shape", [(1, 4, 8, 16), (1, 8, 7, 11), (1, 8, 3, 8), (1, 7, 5, 3)]) -def test_3_tiled_poolings(tile_shape): - A = te.placeholder((1, 14, 14, 16), name="A", dtype="int8") - pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - pool_b = topi.nn.pool2d(pool_a, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - pool_c = topi.nn.pool2d(pool_b, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - - sch = tvm.te.create_schedule([pool_c.op]) - oi, ii = _tile_nd(sch, pool_c, tile_shape) - sch[pool_b].compute_at(sch[pool_c], oi[-1]) - sch[pool_b].rolling_buffer() - sch[pool_a].compute_at(sch[pool_c], oi[-1]) - sch[pool_a].rolling_buffer() - - _verify_schedule(sch, [A], pool_c) - - -@pytest.mark.parametrize("tile_shape", [(1, 4, 8, 16), (1, 8, 7, 11), (1, 8, 3, 8), (1, 7, 5, 3)]) -def test_tiled_added_poolings(tile_shape): - A = te.placeholder((1, 12, 12, 16), name="A", dtype="int8") - B = te.placeholder((1, 14, 14, 16), name="A", dtype="int8") - pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - pool_b = topi.nn.pool2d(B, (5, 5), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - add = topi.add(pool_a, pool_b) - pool_c = topi.nn.pool2d(add, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - - sch = tvm.te.create_schedule([pool_c.op]) - oi, ii = _tile_nd(sch, pool_c, tile_shape) - sch[add].compute_at(sch[pool_c], oi[-1]) - sch[add].rolling_buffer() - sch[pool_b].compute_at(sch[pool_c], oi[-1]) - sch[pool_b].rolling_buffer() - sch[pool_a].compute_at(sch[pool_c], oi[-1]) - sch[pool_a].rolling_buffer() - - _verify_schedule(sch, [A, B], pool_c) - - -@pytest.mark.parametrize("make_rolling", [(0, 0), (1, 0), (0, 1), (1, 1)]) -def test_mixed_buffers(make_rolling): - A = te.placeholder((1, 14, 14, 16), name="A", dtype="int8") - 
pool_a = topi.nn.pool2d(A, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - pool_b = topi.nn.pool2d(pool_a, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - pool_c = topi.nn.pool2d(pool_b, (3, 3), (1, 1), (1, 1), (0, 0, 0, 0), "max", layout="NHWC") - - sch = tvm.te.create_schedule([pool_c.op]) - oi, ii = _tile_nd(sch, pool_c, (1, 4, 8, 16)) - sch[pool_b].compute_at(sch[pool_c], oi[-1]) - if make_rolling[0]: - sch[pool_b].rolling_buffer() - sch[pool_a].compute_at(sch[pool_c], oi[-1]) - if make_rolling[1]: - sch[pool_a].rolling_buffer() - - _verify_schedule(sch, [A], pool_c) - - @tvm.script.ir_module class PreRollingBuffer: @T.prim_func diff --git a/tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py b/tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py deleted file mode 100644 index 3078572bb508..000000000000 --- a/tests/python/tir-transform/test_tir_transform_instrument_bound_checkers.py +++ /dev/null @@ -1,608 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import tvm -import tvm.testing -from tvm import te, tir - -import pytest -import numpy as np - - -def collect_visit(stmt, f): - ret = [] - tvm.tir.stmt_functor.post_order_visit(stmt, lambda x: ret.append(f(x))) - return ret - - -@tvm.testing.requires_llvm -@pytest.mark.xfail -def test_out_of_bounds_llvm(index_a, index_b): - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i + index_a] + B[i + index_b], name="C") - s = te.create_schedule(C.op) - tgt = "llvm" - tgt_host = "llvm" - stmt = tvm.lower(s, [A, B, C], simple_mode=True) - print(stmt) - tgt = tvm.target.Target(tgt, tgt_host) - fadd = tvm.build(s, [A, B, C], target=tgt, name="myadd") - dev = tvm.device(tgt.kind.name, 0) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev) - fadd(a, b, c) - - -@tvm.testing.requires_llvm -def test_in_bounds_llvm(): - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = te.create_schedule(C.op) - tgt = "llvm" - tgt_host = "llvm" - stmt = tvm.lower(s, [A, B, C], simple_mode=True) - tgt = tvm.target.Target(tgt, tgt_host) - fadd = tvm.build(s, [A, B, C], target=tgt, name="myadd") - dev = tvm.device(tgt.kind.name, 0) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev) - fadd(a, b, c) - - -@tvm.testing.requires_llvm -@pytest.mark.xfail -def test_out_of_bounds_vectorize_llvm(nn, index_a, index_b): - n = tvm.runtime.convert(nn) - a = te.placeholder((n), name="a") - b = te.placeholder((n), name="b") - c = te.compute((n,), lambda i: a[i + index_a] + b[i + index_b], name="c") - s = te.create_schedule(c.op) - xo, xi = 
s[c].split(c.op.axis[0], factor=8) - s[c].parallel(xo) - s[c].vectorize(xi) - tgt = "llvm" - tgt_host = "llvm" - stmt = tvm.lower(s, [a, b, c], simple_mode=True) - tgt = tvm.target.Target(tgt, tgt_host) - f = tvm.build(s, [a, b, c], target=tgt, name="myaddvec") - dev = tvm.cpu(0) - n = nn - a = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(n)).astype(a.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=c.dtype), dev) - f(a, b, c) - - -@tvm.testing.requires_llvm -def test_in_bounds_vectorize_llvm(): - n = 512 - lanes = 2 - A = te.placeholder((n,), name="A", dtype="float32x%d" % lanes) - B = te.compute((n,), lambda i: A[i], name="B") - C = te.compute((n,), lambda i: B[i] + tvm.tir.const(1, A.dtype), name="C") - s = te.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], nparts=2) - _, xi = s[C].split(xi, factor=2) - s[C].parallel(xo) - s[C].vectorize(xi) - s[B].compute_at(s[C], xo) - xo, xi = s[B].split(B.op.axis[0], factor=2) - s[B].vectorize(xi) - # build and invoke the kernel. - lowered_func = tvm.lower(s, [A, C], "llvm", simple_mode=False) - f = tvm.build(s, [A, C], "llvm") - dev = tvm.cpu(0) - # launch the kernel. 
- a = tvm.nd.empty((n,), A.dtype).copyfrom( - np.random.uniform(size=[n] + ([] if lanes == 1 else [lanes])) - ) - c = tvm.nd.empty((n,), C.dtype, dev) - f(a, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1) - - -@tvm.testing.requires_llvm -def test_in_bounds_loop_partition_basic_llvm(): - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - T = te.compute((n,), lambda i: A[i] + B[i]) - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=4) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev) - t = tvm.nd.empty((32,), T.dtype, dev) - f(a, b, t) - - -@tvm.testing.requires_llvm -@pytest.mark.xfail -def test_out_of_bounds_loop_partition_basic_llvm(index_a, index_b): - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - T = te.compute((n,), lambda i: A[i + index_a] + B[i + index_b]) - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=4) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev) - t = tvm.nd.empty((32,), T.dtype, dev) - f(a, b, t) - - -def test_in_bounds_const_loop_partition_ir(): - def check_attr_stmt(x): - if ( - isinstance(x, tvm.tir.AttrStmt) - and x.attr_key == "buffer_bound" - and tvm.ir.structural_equal(x.value.args, [n]) - ): - return True - return False - - def check_branch_stmt(x): - if isinstance(x, tvm.tir.IfThenElse): - return True - return False - - def assert_bound_instrumentation(stmt, f, nums): - count = 0 - for i in collect_visit(stmt, f): - if i is True: - count = count + 1 - 
assert count == nums - - def collect_branch_stmt(x): - if isinstance(x, tvm.tir.IfThenElse): - branch_collector.append(x) - - n = tir.const(21) - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - T = te.compute((n,), lambda i: A[i] + B[i]) - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=4) - - with tvm.transform.PassContext( - config={ - "tir.instrument_bound_checkers": True, - "tir.LoopPartition": {"partition_const_loop": True}, - } - ): - mod = tvm.driver.lower(s, [A, B, T], name="main") - - stmt = mod["main"].body - # after instrumentation - assert_bound_instrumentation(stmt, check_attr_stmt, 2 * 3) - assert_bound_instrumentation(stmt, check_branch_stmt, 2) - - branch_collector = list() - collect_visit(stmt, collect_branch_stmt) - assert len(branch_collector) == 2 - - -@tvm.testing.requires_llvm -def test_in_bounds_const_loop_partition_llvm(): - with tvm.transform.PassContext( - config={ - "tir.instrument_bound_checkers": True, - "tir.LoopPartition": {"partition_const_loop": True}, - } - ): - n = 21 - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - T = te.compute((n,), lambda i: A[i] + B[i]) - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=4) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev) - t = tvm.nd.empty((n,), T.dtype, dev) - f(a, b, t) - - -@tvm.testing.requires_llvm -@pytest.mark.xfail -def test_out_of_bounds_const_loop_partition_llvm(index_a, index_b): - with tvm.transform.PassContext( - config={ - "tir.instrument_bound_checkers": True, - "tir.LoopPartition": {"partition_const_loop": True}, - } - ): - n = 21 - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - T = te.compute((n,), lambda i: A[i + index_a] + B[i + 
index_b]) - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=4) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev) - t = tvm.nd.empty((n,), T.dtype, dev) - f(a, b, t) - - -@tvm.testing.requires_llvm -def test_in_bounds_conv_llvm(loop_tiling=False): - HSTR = WSTR = 1 - in_channel = 128 - kernel_height = kernel_width = 3 - out_channel = 64 - batch_size = 1 - in_height = in_width = 64 - out_height = out_width = in_height - kernel_height + 1 - data = te.placeholder((batch_size, in_channel, in_height, in_width), name="data") - kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name="kernel") - ic = te.reduce_axis((0, in_channel), name="ic") - kh = te.reduce_axis((0, kernel_height), name="kh") - kw = te.reduce_axis((0, kernel_width), name="kw") - conv = te.compute( - (batch_size, out_channel, out_height, out_width), - lambda n, oc, oh, ow: te.sum( - data[n, ic, oh * HSTR + kh, ow * WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw] - ), - name="conv2d", - ) - s = te.create_schedule(conv.op) - - n, oc, oh, ow = conv.op.axis - if loop_tiling: - oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) - lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True) - dev = tvm.cpu(0) - - f = tvm.build(s, [data, kernel, conv], "llvm") - data_input = tvm.nd.array( - np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev - ) - kernel_input = tvm.nd.array( - np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype( - "float32" - ), - dev, - ) - conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev) - f(data_input, kernel_input, conv_out) - - -@tvm.testing.requires_llvm -@pytest.mark.xfail -def 
test_out_of_bounds_conv_llvm(data_offsets, kernel_offsets, loop_tiling=False): - HSTR = WSTR = 1 - in_channel = 128 - kernel_height = kernel_width = 3 - out_channel = 64 - batch_size = 1 - in_height = in_width = 64 - out_height = out_width = in_height - kernel_height + 1 - data = te.placeholder((batch_size, in_channel, in_height, in_width), name="data") - kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name="kernel") - ic = te.reduce_axis((0, in_channel), name="ic") - kh = te.reduce_axis((0, kernel_height), name="kh") - kw = te.reduce_axis((0, kernel_width), name="kw") - conv = te.compute( - (batch_size, out_channel, out_height, out_width), - lambda n, oc, oh, ow: te.sum( - data[ - n + data_offsets[0], - ic + data_offsets[1], - oh * HSTR + kh + data_offsets[2], - ow * WSTR + kw + data_offsets[3], - ] - * kernel[ - kh + kernel_offsets[0], - kw + kernel_offsets[1], - ic + kernel_offsets[2], - oc + kernel_offsets[3], - ], - axis=[ic, kh, kw], - ), - name="conv2d", - ) - s = te.create_schedule(conv.op) - - n, oc, oh, ow = conv.op.axis - if loop_tiling: - oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) - lowered_func = tvm.lower(s, [data, kernel, conv], simple_mode=True) - dev = tvm.cpu(0) - - f = tvm.build(s, [data, kernel, conv], "llvm") - data_input = tvm.nd.array( - np.random.uniform(size=(batch_size, in_channel, in_height, in_width)).astype("float32"), dev - ) - kernel_input = tvm.nd.array( - np.random.uniform(size=(kernel_height, kernel_width, in_channel, out_channel)).astype( - "float32" - ), - dev, - ) - conv_out = tvm.nd.empty((batch_size, out_channel, out_height, out_width), "float32", dev) - f(data_input, kernel_input, conv_out) - - -@tvm.testing.requires_llvm -def test_in_bounds_tensors_with_same_shapes1D_llvm(): - n = te.size_var("n") - k = te.size_var("k") - m = te.size_var("m") - A = te.placeholder((n,), name="A") - B = te.placeholder((k,), name="B") - - T = te.compute((m,), lambda i: A[i] * B[i]) - s = 
te.create_schedule(T.op) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32,)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(32,)).astype(B.dtype), dev) - t = tvm.nd.empty((32,), T.dtype, dev) - f(a, b, t) - - -@tvm.testing.requires_llvm -@pytest.mark.xfail -def test_out_of_bounds_tensors_with_diff_shapes1D_llvm(a_shape, b_shape, c_shape): - n = te.size_var("n") - k = te.size_var("k") - m = te.size_var("m") - A = te.placeholder((n,), name="A") - B = te.placeholder((k,), name="B") - - T = te.compute((m,), lambda i: A[i] * B[i]) - s = te.create_schedule(T.op) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(a_shape,)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(b_shape,)).astype(B.dtype), dev) - t = tvm.nd.empty((c_shape,), T.dtype, dev) - f(a, b, t) - - -@tvm.testing.requires_llvm -def test_in_bounds_tensors_with_same_shapes2D_llvm(): - n = te.size_var("n") - k = te.size_var("k") - m = te.size_var("m") - A = te.placeholder((n, n), name="A") - B = te.placeholder((k, k), name="B") - - T = te.compute((m, m), lambda i, j: A[i][j] * B[i][j]) - s = te.create_schedule(T.op) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(32, 32)).astype(B.dtype), dev) - t = tvm.nd.empty((32, 32), T.dtype, dev) - f(a, b, t) - - -@tvm.testing.requires_llvm -@pytest.mark.xfail -def test_out_of_bounds_tensors_with_diff_shapes2D_llvm(a_shape, b_shape, c_shape): - n = te.size_var("n") - k = te.size_var("k") - m = te.size_var("m") - A = te.placeholder((n, n), name="A") - B = te.placeholder((k, k), name="B") - - T = 
te.compute((m, m), lambda i, j: A[i][j] * B[i][j]) - s = te.create_schedule(T.op) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(a_shape[0], a_shape[1])).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(b_shape[0], b_shape[1])).astype(B.dtype), dev) - t = tvm.nd.empty((c_shape[0], c_shape[1]), T.dtype, dev) - f(a, b, t) - - -@tvm.testing.requires_llvm -def test_in_bounds_tensors_with_same_shapes3D_llvm(): - n = te.size_var("n") - k = te.size_var("k") - m = te.size_var("m") - A = te.placeholder((n, n, n), name="A") - B = te.placeholder((k, k, k), name="B") - - T = te.compute((m, m, m), lambda i, j, p: A[i][j][p] * B[i][j][p]) - s = te.create_schedule(T.op) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=(32, 32, 32)).astype(B.dtype), dev) - t = tvm.nd.empty((32, 32, 32), T.dtype, dev) - f(a, b, t) - - -@tvm.testing.requires_llvm -@pytest.mark.xfail -def test_out_of_bounds_tensors_with_diff_shapes3D_llvm(a_shape, b_shape, c_shape): - n = te.size_var("n") - k = te.size_var("k") - m = te.size_var("m") - A = te.placeholder((n, n, n), name="A") - B = te.placeholder((k, k, k), name="B") - - T = te.compute((m, m, m), lambda i, j, p: A[i][j][p] * B[i][j][p]) - s = te.create_schedule(T.op) - lowered_func = tvm.lower(s, [A, B, T], "llvm", simple_mode=False) - - dev = tvm.cpu(0) - - f = tvm.build(s, [A, B, T], "llvm") - a = tvm.nd.array( - np.random.uniform(size=(a_shape[0], a_shape[1], c_shape[2])).astype(A.dtype), dev - ) - b = tvm.nd.array( - np.random.uniform(size=(b_shape[0], b_shape[1], b_shape[2])).astype(B.dtype), dev - ) - t = tvm.nd.empty((c_shape[0], c_shape[1], c_shape[2]), T.dtype, dev) - f(a, b, t) - - -@tvm.testing.requires_llvm 
-@pytest.mark.xfail -def test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm(): - n = 64 - A = te.placeholder((n,), name="A") - scale = te.placeholder((), name="scale") - k = te.reduce_axis((0, n), name="k") - C = te.compute((), lambda: te.sum(A[k + k + k] * scale, axis=k), name="C") - D = te.compute((), lambda: C + 1) - s = te.create_schedule(D.op) - stmt = tvm.lower(s, [A, scale, D], simple_mode=True) - - # build and invoke the kernel. - f = tvm.build(s, [A, scale, D], "llvm") - dev = tvm.cpu(0) - # launch the kernel. - a = tvm.nd.array(np.random.randint(0, 2, size=(n,)).astype(A.dtype), dev) - sc = tvm.nd.array(np.random.randint(0, 2, size=()).astype(scale.dtype), dev) - d = tvm.nd.empty((), D.dtype, dev) - f(a, sc, d) - d_np = np.sum(a.numpy()) * sc.numpy() + 1 - tvm.testing.assert_allclose(d.numpy(), d_np) - - -if __name__ == "__main__": - with tvm.transform.PassContext( - config={ - "tir.instrument_bound_checkers": True, - } - ): - # zero scale - test_out_of_bounds_tensors_with_zero_shape_op_with_not_zero_shape_llvm() - # in bound - test_in_bounds_llvm() - # upper bound - test_out_of_bounds_llvm(1, 0) - test_out_of_bounds_llvm(0, 1) - test_out_of_bounds_llvm(1, 1) - test_out_of_bounds_llvm(10000, 0) - test_out_of_bounds_llvm(0, 10000) - test_out_of_bounds_llvm(10000, 10000) - # lower bound - test_out_of_bounds_llvm(-1, 0) - test_out_of_bounds_llvm(0, -1) - test_out_of_bounds_llvm(-1, -1) - test_out_of_bounds_llvm(-10000, 0) - test_out_of_bounds_llvm(0, -10000) - test_out_of_bounds_llvm(-10000, -10000) - # vectorize in bound - test_in_bounds_vectorize_llvm() - # vectorization upper bound - test_out_of_bounds_vectorize_llvm(1024, 1000, 0) - test_out_of_bounds_vectorize_llvm(1024, 0, 10000) - # vectorization lower bound - test_out_of_bounds_vectorize_llvm(1024, -1000, 0) - test_out_of_bounds_vectorize_llvm(1024, 0, -10000) - test_in_bounds_const_loop_partition_llvm() - test_out_of_bounds_const_loop_partition_llvm(1, 0) - 
test_out_of_bounds_const_loop_partition_llvm(0, 1) - test_out_of_bounds_const_loop_partition_llvm(-1, 0) - test_out_of_bounds_const_loop_partition_llvm(0, -1) - test_in_bounds_loop_partition_basic_llvm() - test_out_of_bounds_loop_partition_basic_llvm(32, 0) - test_out_of_bounds_loop_partition_basic_llvm(0, 32) - test_out_of_bounds_loop_partition_basic_llvm(-32, 0) - test_out_of_bounds_loop_partition_basic_llvm(0, -32) - # conv - test_in_bounds_conv_llvm() - test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0]) - test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0]) - test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0]) - test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0]) - test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0]) - test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0]) - test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0]) - test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0]) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0]) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0]) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0]) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1]) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0]) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0]) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0]) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1]) - # loop tiling - test_in_bounds_conv_llvm(True) - test_out_of_bounds_conv_llvm([1, 0, 0, 0], [0, 0, 0, 0], True) - test_out_of_bounds_conv_llvm([0, 1, 0, 0], [0, 0, 0, 0], True) - test_out_of_bounds_conv_llvm([0, 0, 1, 0], [0, 0, 0, 0], True) - test_out_of_bounds_conv_llvm([0, 0, 0, 1], [0, 0, 0, 0], True) - test_out_of_bounds_conv_llvm([-1, 0, 0, 0], [0, 0, 0, 0], True) - test_out_of_bounds_conv_llvm([0, -1, 0, 0], [0, 0, 0, 0], True) - test_out_of_bounds_conv_llvm([0, 0, -1, 0], [0, 0, 0, 0], True) - test_out_of_bounds_conv_llvm([0, 0, 0, -1], [0, 0, 0, 0], True) - 
test_out_of_bounds_conv_llvm([0, 0, 0, 0], [1, 0, 0, 0], True) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 1, 0, 0], True) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 1, 0], True) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, 1], True) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [-1, 0, 0, 0], True) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, -1, 0, 0], True) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, -1, 0], True) - test_out_of_bounds_conv_llvm([0, 0, 0, 0], [0, 0, 0, -1], True) - # tensors with diff shapes basic operation such as mul - test_out_of_bounds_tensors_with_diff_shapes1D_llvm(32, 64, 64) - test_out_of_bounds_tensors_with_diff_shapes1D_llvm(64, 32, 64) - test_out_of_bounds_tensors_with_diff_shapes2D_llvm([64, 64], [32, 32], [64, 64]) - test_out_of_bounds_tensors_with_diff_shapes2D_llvm([32, 32], [64, 64], [64, 64]) - test_out_of_bounds_tensors_with_diff_shapes3D_llvm([64, 64, 64], [32, 32, 32], [64, 64, 64]) - test_out_of_bounds_tensors_with_diff_shapes3D_llvm([32, 32, 32], [64, 64, 64], [64, 64, 64]) - # check tensors with the same shapes - test_in_bounds_tensors_with_same_shapes1D_llvm() - test_in_bounds_tensors_with_same_shapes2D_llvm() - test_in_bounds_tensors_with_same_shapes3D_llvm() - # ir tests - test_in_bounds_const_loop_partition_ir() diff --git a/tests/python/tir-transform/test_tir_transform_loop_partition.py b/tests/python/tir-transform/test_tir_transform_loop_partition.py index 5f24d1666fe6..bec4129ffcbf 100644 --- a/tests/python/tir-transform/test_tir_transform_loop_partition.py +++ b/tests/python/tir-transform/test_tir_transform_loop_partition.py @@ -29,74 +29,6 @@ def collect_visit(stmt, f): return ret -def test_basic(): - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - T = te.compute((n,), lambda i: A[i] + B[i]) - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=4) - - bounds = tvm.te.schedule.InferBound(s) - stmt = 
tvm.te.schedule.ScheduleOps(s, bounds) - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([n], stmt).with_attr("global_symbol", "main")) - mod = tvm.tir.transform.LoopPartition()(mod) - stmt = tvm.tir.transform.Simplify()(mod)["main"] - - assert not any(collect_visit(stmt.body.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse))) - assert any(collect_visit(stmt.body.body[1], lambda x: isinstance(x, tvm.tir.IfThenElse))) - - -def test_const_loop(): - n = 21 - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - T = te.compute((n,), lambda i: A[i] + B[i]) - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=4) - - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main")) - with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): - mod = tvm.tir.transform.LoopPartition()(mod) - stmt = tvm.tir.transform.Simplify()(mod)["main"].body - - assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse))) - - -def test_no_unroll_loop(): - n = 21 - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - T = te.compute((n,), lambda i: A[i] + B[i]) - s = te.create_schedule(T.op) - xo, xi = s[T].split(T.op.axis[0], factor=4) - - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main")) - with tvm.transform.PassContext( - config={ - "tir.LoopPartition": { - "partition_const_loop": True, - "no_unroll_loop_with_extent_one": True, - } - } - ): - mod = tvm.tir.transform.LoopPartition()(mod) - mod = tvm.tir.transform.Simplify()(mod) - stmt = tvm.tir.transform.RemoveNoOp()(mod)["main"].body - - assert sum(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.For))) == 4 - - def test_multi_loop(): ib = 
tvm.tir.ir_builder.create() m = te.size_var("m") @@ -141,52 +73,6 @@ def test_multi_if(): assert not any(collect_visit(stmt.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse))) -def test_thread_axis(): - m = te.size_var("m") - l = te.size_var("l") - A = te.placeholder((m, l), name="A") - B = te.compute((m, l), lambda i, j: A[i, j] + 3, name="B") - s = te.create_schedule(B.op) - - s[B].set_scope("shared") - num_thread = 16 - xo, xi = s[B].split(B.op.axis[0], 32) - xi0, xi1 = s[B].split(xi, nparts=num_thread) - s[B].bind(xi0, te.thread_axis("threadIdx.x")) - - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main")) - mod = tvm.tir.transform.LoopPartition()(mod) - stmt = tvm.tir.transform.Simplify()(mod)["main"] - - assert not any(collect_visit(stmt.body.body[0], lambda x: isinstance(x, tvm.tir.IfThenElse))) - - -def test_vectorize(): - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - bias = te.size_var("bias", dtype="float32") - scale = te.size_var("scale", dtype="float32") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i) * scale + bias, name="C") - # schedule - s = te.create_schedule(C.op) - # create iter var and assign them tags. 
- num_thread = 32 - bx, x = s[C].split(C.op.axis[0], factor=num_thread * 4) - tx, x = s[C].split(x, nparts=num_thread) - _, x = s[C].split(x, factor=4) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - s[C].vectorize(x) - stmt = tvm.lower(s, [A, B], name="main")["main"] - body = stmt.body.body.body.body - assert x.var.name not in str(body.condition) - assert any(collect_visit(body.then_case, lambda x: isinstance(x, tvm.tir.Ramp))) - - def test_condition(): ib = tvm.tir.ir_builder.create() m = te.size_var("m") @@ -219,24 +105,6 @@ def test_condition_EQ(): assert not any(collect_visit(stmt[0], lambda x: isinstance(x, tvm.tir.Select))) -def test_thread_axis2(): - n = tvm.runtime.convert(4096) - m = te.size_var("m") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = te.create_schedule(C.op) - num_thread = 32 - bx, x = s[C].split(C.op.axis[0], factor=32) - tx, x = s[C].split(x, nparts=num_thread) - _, x = s[C].split(x, factor=m) - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - stmt = tvm.lower(s, [A, B], name="main")["main"] - for_body = stmt.body.body.body.body[0] - assert "threadIdx" not in str(for_body.extent) - - def test_everything_during_deduction(): m = te.size_var("m") n = te.size_var("n") @@ -255,55 +123,6 @@ def test_everything_during_deduction(): assert isinstance(stmt.body.body, tvm.tir.IfThenElse) -def test_single_likely(): - n = 60 - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - - T = te.compute((n,), lambda i: A[i] + B[i]) - s = te.create_schedule(T.op) - x = T.op.axis[0] - xo, xi = s[T].split(x, factor=16) - - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main")) - - with 
tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): - mod = tvm.tir.transform.LoopPartition()(mod) - stmt = tvm.tir.transform.Simplify()(mod)["main"].body - - assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse))) - - -def test_multi_likely(): - n = 94 - m = 62 - A = te.placeholder((n, m), name="A") - B = te.placeholder((n, m), name="B") - - T = te.compute((n, m), lambda i, j: A[i, j] + B[i, j]) - s = te.create_schedule(T.op) - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - x, y = T.op.axis - xo, xi = s[T].split(x, factor=16) - yo, yi = s[T].split(y, factor=16) - s[T].reorder(xo, yo, xi, yi) - - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main")) - - with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): - mod = tvm.tir.transform.LoopPartition()(mod) - stmt = tvm.tir.transform.Simplify()(mod)["main"].body - - assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse))) - - def test_oneD_pool(): m = te.size_var("m") ib = tvm.tir.ir_builder.create() @@ -415,135 +234,6 @@ def test_cce_loop_3(): assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse))) -def test_conv_tiling(): - HSTR = WSTR = 1 - in_channel = 128 - kernel_height = kernel_width = 3 - out_channel = 64 - batch_size = 1 - in_height = in_width = 64 - out_height = out_width = in_height - kernel_height + 1 - data = te.placeholder((batch_size, in_channel, in_height, in_width), name="data") - kernel = te.placeholder((kernel_height, kernel_width, in_channel, out_channel), name="kernel") - ic = te.reduce_axis((0, in_channel), name="ic") - kh = te.reduce_axis((0, kernel_height), name="kh") - kw = te.reduce_axis((0, kernel_width), name="kw") - conv = te.compute( - (batch_size, out_channel, out_height, 
out_width), - lambda n, oc, oh, ow: te.sum( - data[n, ic, oh * HSTR + kh, ow * WSTR + kw] * kernel[kh, kw, ic, oc], axis=[ic, kh, kw] - ), - name="conv2d", - ) - s = te.create_schedule(conv.op) - - n, oc, oh, ow = conv.op.axis - oho, owo, ohi, owi = s[conv].tile(oh, ow, 16, 16) - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt).with_attr("global_symbol", "main")) - with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): - mod = tvm.tir.transform.LoopPartition()(mod) - stmt = tvm.tir.transform.Simplify()(mod)["main"].body - - assert not any(collect_visit(stmt, lambda x: isinstance(x, tvm.tir.IfThenElse))) - - -def test_multilevel_splitting_with_indivisble_factors(): - from tvm import topi - - A = te.placeholder((130,), dtype="float32") - B = topi.nn.relu(A) - s = te.create_schedule(B.op) - (y,) = s[B].op.axis - (yo, yi) = s[B].split(y, factor=8) - (yoo, yoi) = s[B].split(yo, factor=16) - s[B].reorder(yoo, yoi, yi) - s[B].unroll(yi) - - ## But this does the right thing. 
- with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): - lowered_body = tvm.lower(s, [A, B], name="x")["x"].body - - def visit_stmt(op): - return isinstance(op, tvm.tir.Max) - - num_max = collect_visit(lowered_body, visit_stmt) - assert num_max.count(True) == 10 - - -def test_double_splitting_with_indivisible_factors(): - m = 48 - dtype = "float32" - A = te.placeholder((m,), name="A", dtype=dtype) - C = te.compute((m,), lambda i: A[i], name="C") - D = te.compute((m,), lambda i: C[i], name="D") - - s = te.create_schedule(D.op) - co, ci = s[C].split(C.op.axis[0], factor=10) - do, di = s[D].split(D.op.axis[0], 32) - s[C].compute_at(s[D], do) - - target = "llvm" - with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): - f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False) - func = tvm.build(f, target=target) - - top_produce = f["fadd1"].body - assert not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.tir.IfThenElse))) - - # check functional correctness of generated code - dev = tvm.device(target, 0) - a = tvm.nd.array( - numpy.ones( - m, - ).astype(dtype), - dev, - ) - c = tvm.nd.array( - numpy.zeros( - m, - ).astype(dtype), - dev, - ) - d = tvm.nd.array( - numpy.zeros( - m, - ).astype(dtype), - dev, - ) - func(a, c, d) - tvm.testing.assert_allclose(c.numpy(), a.numpy(), rtol=1e-5) - tvm.testing.assert_allclose(d.numpy(), a.numpy(), rtol=1e-5) - - -def test_simple_rfactor(): - K = 16 * 4 + 4 - k = te.reduce_axis((0, K), "k") - - A = te.placeholder((1, K), name="A") - - B = te.compute((1,), lambda b: te.sum(A[b, k], axis=k), name="B") - - s = te.create_schedule(B.op) - ko, _ = s[B].split(s[B].op.reduce_axis[0], 16) - BF = s.rfactor(B, ko, 0) - - s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt1 = tvm.te.schedule.ScheduleOps(s, bounds) - - mod1 = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt1).with_attr("global_symbol", "main")) - stmt1 = 
tvm.tir.transform.Simplify()(mod1)["main"].body - - with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): - mod2 = tvm.tir.transform.LoopPartition()(mod1) - stmt2 = tvm.tir.transform.Simplify()(mod2)["main"].body - - # make sure loop partition actually did something - assert not tvm.ir.structural_equal(stmt1.body, stmt2.body) - - @T.prim_func def partitioned_concat( A: T.Buffer((16,), "float32"), B: T.Buffer((16,), "float32"), C: T.Buffer((32,), "float32") @@ -555,21 +245,6 @@ def partitioned_concat( C[i + 16] = B[i + 16] -def test_explicit_partition_hint(): - A = te.placeholder((16,), name="A") - B = te.placeholder((16,), name="B") - C = te.compute((32,), lambda i: te.if_then_else(i < 16, A[i], B[i]), name="C") - s = te.create_schedule(C.op) - s.normalize() - s[C].pragma(s[C].op.axis[0], "loop_partition_hint", True) - mod = tvm.driver.build_module.schedule_to_module(s, [A, B, C], "main", None) - with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): - mod = tvm.tir.transform.StorageFlatten(64)(mod) - mod = tvm.tir.transform.LoopPartition()(mod) - mod = tvm.tir.transform.Simplify()(mod) - tvm.ir.assert_structural_equal(mod["main"], partitioned_concat) - - def partition_from_scheduled_tir(prim_func, pass_cfg, do_flatten=True): with tvm.transform.PassContext(config=pass_cfg): mod = IRModule.from_expr(prim_func.with_attr("global_symbol", "main")) diff --git a/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py b/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py index 35b4d55ea51d..63700853b36a 100644 --- a/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py +++ b/tests/python/tir-transform/test_tir_transform_lower_cross_thread_reduction.py @@ -1897,21 +1897,6 @@ def test_no_thread_broadcast_rewrite(): _check(no_thread_broadcast, lowered_no_thread_broadcast) -def test_lower_te(): - a = te.placeholder((32, 2, 2)) 
- k1 = te.reduce_axis((0, 2), "k1") - k2 = te.reduce_axis((0, 2), "k2") - b = te.compute((32,), lambda i: te.sum(a[i, k1, k2], axis=[k1, k2])) - s = te.create_schedule(b.op) - s[b].bind(k1, te.thread_axis("threadIdx.x")) - s[b].bind(k2, te.thread_axis("threadIdx.y")) - orig_mod = tvm.driver.build_module.schedule_to_module(s, [a, b]) - mod = tvm.tir.transform.LowerCrossThreadReduction()(orig_mod) - tvm.ir.assert_structural_equal( - mod, orig_mod - ) # LowerCrossThreadReduction should do nothing on TE - - def test_layer_norm_tuple_sum(): _check(layer_norm_tuple_sum, lowered_layer_norm_tuple_sum) diff --git a/tests/python/tir-transform/test_tir_transform_lower_init_block.py b/tests/python/tir-transform/test_tir_transform_lower_init_block.py index 3ada747f6915..d05b8bc71f46 100644 --- a/tests/python/tir-transform/test_tir_transform_lower_init_block.py +++ b/tests/python/tir-transform/test_tir_transform_lower_init_block.py @@ -105,15 +105,6 @@ def test_lower_match_buffer(): tvm.ir.assert_structural_equal(mod, BranchWithMatchBuffer, True) -def test_lower_te(): - x = te.placeholder((1,)) - y = te.compute((1,), lambda i: x[i] + 2) - s = te.create_schedule(y.op) - orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y]) - mod = tvm.tir.transform.LowerInitBlock()(orig_mod) - tvm.ir.assert_structural_equal(mod, orig_mod) # LowerInitBlock should do nothing on TE - - if __name__ == "__main__": test_lower_reduction() test_lower_match_buffer() diff --git a/tests/python/tir-transform/test_tir_transform_lower_intrin.py b/tests/python/tir-transform/test_tir_transform_lower_intrin.py index 0764daac461a..3eb642fb51b3 100644 --- a/tests/python/tir-transform/test_tir_transform_lower_intrin.py +++ b/tests/python/tir-transform/test_tir_transform_lower_intrin.py @@ -47,9 +47,7 @@ def make_binds(i): return x C = te.compute((n,), make_binds) - s = te.create_schedule([C.op]) - - f = tvm.build(s, [A, B, C], "llvm") + f = tvm.build(te.create_prim_func([A, B, C]), "llvm") a = 
tvm.nd.array(np.array([x for x, y in data], dtype=expr.dtype)) b = tvm.nd.array(np.array([y for x, y in data], dtype=expr.dtype)) c = tvm.nd.array(np.zeros(len(data), dtype=expr.dtype)) diff --git a/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py b/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py index ae44d2127595..dbaafb617aad 100644 --- a/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py +++ b/tests/python/tir-transform/test_tir_transform_lower_opaque_block.py @@ -349,15 +349,6 @@ def test_symbolic_strided_buffer(): _check(compacted_symbolic_strided_buffer_func, transformed_symbolic_strided_buffer_func) -def test_lower_te(): - x = te.placeholder((1,)) - y = te.compute((1,), lambda i: x[i] + 2) - s = te.create_schedule(y.op) - orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y]) - mod = tvm.tir.transform.LowerOpaqueBlock()(orig_mod) - tvm.ir.assert_structural_equal(mod, orig_mod) # LowerOpaqueBlock should do nothing on TE - - def test_annotated_loops(): mod = tvm.IRModule.from_expr(annotated_loops.with_attr("global_symbol", "main")) mod = tvm.tir.transform.LowerOpaqueBlock()(mod) diff --git a/tests/python/tir-transform/test_tir_transform_lower_warp_memory.py b/tests/python/tir-transform/test_tir_transform_lower_warp_memory.py deleted file mode 100644 index 99ccc5556585..000000000000 --- a/tests/python/tir-transform/test_tir_transform_lower_warp_memory.py +++ /dev/null @@ -1,356 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import numpy as np -import pytest -import tvm -import tvm.testing -from tvm import te, tir -from tvm.contrib.nvcc import have_fp16 - - -def _run_passes(mod): - cuda_target = tvm.target.Target("cuda", host="llvm") - assert cuda_target.thread_warp_size == 32 - mod = tvm.tir.transform.Apply(lambda f: f.with_attr("target", cuda_target))(mod) - mod = tvm.tir.transform.AnnotateDeviceRegions()(mod) - mod = tvm.tir.transform.SplitHostDevice()(mod) - mod = tvm.tir.transform.LowerWarpMemory()(mod) - return mod - - -@tvm.testing.requires_cuda -def test_lower_warp_memory_local_scope(): - m = 128 - A = te.placeholder((m,), name="A") - B = te.compute((m,), lambda i: A[i] + 3, name="B") - - s = te.create_schedule(B.op) - AA = s.cache_read(A, "warp", [B]) - xo, xi = s[B].split(B.op.axis[0], 64) - xi0, xi1 = s[B].split(xi, factor=32) - tx = te.thread_axis("threadIdx.x") - s[B].bind(xi1, tx) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[AA].compute_at(s[B], xo) - xo, xi = s[AA].split(s[AA].op.axis[0], 32) - s[AA].bind(xi, tx) - - # lowering with the CSE pass disabled as otherwise it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - mod = tvm.lower(s, [A, B], name="f") - - mod = _run_passes(mod) - fdevice = mod["f_kernel"] - - allocate = fdevice - while not isinstance(allocate, tir.Allocate): - allocate = allocate.body - - assert allocate.buffer_var.type_annotation.storage_scope == "local" - assert allocate.extents[0].value == 2 - - -@tvm.testing.requires_cuda -def 
test_lower_warp_memory_correct_indices(): - n = 32 - A = te.placeholder((2, n, n), name="A", dtype="float32") - C = te.compute((2, n, n), lambda x, i, j: A(x, i, (j + 1) % n), name="C") - - s = te.create_schedule(C.op) - bk_x = te.thread_axis("blockIdx.x") - th_y = te.thread_axis("threadIdx.y") - th_x = te.thread_axis("threadIdx.x") - B = s.cache_read(A, "warp", [C]) - cx, ci, cj = C.op.axis - bx, bi, bj = B.op.axis - s[C].bind(cj, th_x) - s[C].bind(cx, bk_x) - s[B].compute_at(s[C], cx) - s[B].bind(bi, th_y) - s[B].bind(bj, th_x) - - bounds = tvm.te.schedule.InferBound(s) - ir = tvm.te.schedule.ScheduleOps(s, bounds) - inner_func = ir.body.body.body - store_A_warp = inner_func.seq[0].body.body - indices = list(store_A_warp.indices) - - # A.warp is actually many buffers, one for each warp, although they are all called A.warp - # 1. If we are accessing from different threads within a same warp (different - # threadIdx.x), we need to distinguish between each elements using threadIdx.x, - # so threadIdx.x is one if the indices. - # 2. If we are accessing from different warps (different threadIdx.y), we are actually - # assessing different buffers, so there is no need to distinguish from elements, - # and therefore threadIdx.y is NOT a index. 
- idx_names = map(lambda x: x.name, filter(lambda x: type(x) is tvm.tir.expr.Var, indices)) - assert "threadIdx.x" in idx_names - assert "threadIdx.y" not in idx_names - - -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_lower_warp_memory_cuda_end_to_end(): - def check_cuda(dtype): - if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): - print("Skip because gpu does not have fp16 support") - return - - m = 128 - A = te.placeholder((m,), name="A", dtype=dtype) - B = te.compute((m,), lambda i: A[i // 32 * 32 + (i + 1) % 32], name="B") - - cuda_target = tvm.target.Target("cuda", host="llvm") - assert cuda_target.thread_warp_size == 32 - with cuda_target: - s = te.create_schedule(B.op) - AA = s.cache_read(A, "warp", [B]) - xo, xi = s[B].split(B.op.axis[0], 64) - xi0, xi1 = s[B].split(xi, factor=32) - tx = te.thread_axis("threadIdx.x") - s[B].bind(xi1, tx) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[AA].compute_at(s[B], xo) - xo, xi = s[AA].split(s[AA].op.axis[0], 32) - s[AA].bind(xi, tx) - - dev = tvm.cuda(0) - # building with the CSE pass disabled as otherwise it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - func = tvm.build(s, [A, B], "cuda") - A_np = np.array(list(range(m)), dtype=dtype) - B_np = np.array( - list(range(1, 32)) - + [0] - + list(range(33, 64)) - + [32] - + list(range(65, 96)) - + [64] - + list(range(97, 128)) - + [96], - dtype=dtype, - ) - A_nd = tvm.nd.array(A_np, dev) - B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev) - func(A_nd, B_nd) - tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3) - - check_cuda("float32") - check_cuda("float16") - - -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_lower_warp_memory_cuda_half_a_warp(): - def check_cuda(dtype): - if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): - print("Skip because gpu does not have fp16 support") - return - - n, m = 16, 16 
- A = te.placeholder( - ( - n, - m, - ), - name="A", - dtype=dtype, - ) - B = te.compute( - ( - n, - m, - ), - lambda j, i: A[j, (i + 1) % m], - name="B", - ) - - cuda_target = tvm.target.Target("cuda", host="llvm") - assert cuda_target.thread_warp_size == 2 * m - with cuda_target: - s = te.create_schedule(B.op) - tx = te.thread_axis("threadIdx.x") - ty = te.thread_axis("threadIdx.y") - bx = te.thread_axis("blockIdx.x") - - AA = s.cache_read(A, "warp", [B]) - y, x = B.op.axis - z, y = s[B].split(y, nparts=2) - s[B].bind(x, tx) - s[B].bind(y, ty) - s[B].bind(z, bx) - s[AA].compute_at(s[B], y) - _, x = AA.op.axis - s[AA].bind(x, tx) - - dev = tvm.cuda(0) - # building with the CSE pass disabled as otherwise it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - func = tvm.build(s, [A, B], "cuda") - A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype) - B_np = np.array([list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype) - A_nd = tvm.nd.array(A_np, dev) - B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev) - func(A_nd, B_nd) - tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3) - - check_cuda("float32") - check_cuda("float16") - - -@tvm.testing.requires_gpu -@tvm.testing.requires_cuda -def test_lower_warp_memory_cuda_2_buffers(): - def check_cuda(dtype): - if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version): - print("Skip because gpu does not have fp16 support") - return - - m = 32 - A = te.placeholder((m,), name="A", dtype=dtype) - B = te.placeholder((m,), name="B", dtype=dtype) - C = te.compute((m,), lambda i: A[(i + 1) % m] + B[(i + 1) % m], name="C") - - cuda_target = tvm.target.Target("cuda", host="llvm") - assert m <= cuda_target.thread_warp_size - with cuda_target: - s = te.create_schedule(C.op) - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - - AA = s.cache_read(A, "warp", [C]) - BB = s.cache_read(B, 
"warp", [C]) - xo, xi = s[C].split(C.op.axis[0], nparts=1) - s[C].bind(xi, tx) - s[C].bind(xo, bx) - s[AA].compute_at(s[C], xo) - s[BB].compute_at(s[C], xo) - xo, xi = s[AA].split(s[AA].op.axis[0], nparts=1) - s[AA].bind(xo, bx) - s[AA].bind(xi, tx) - xo, xi = s[BB].split(s[BB].op.axis[0], nparts=1) - s[BB].bind(xo, bx) - s[BB].bind(xi, tx) - - dev = tvm.cuda(0) - # building with the CSE pass disabled as otherwise it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - func = tvm.build(s, [A, B, C], "cuda") - AB_np = np.array(list(range(m)), dtype=dtype) - C_np = np.array(list(range(1, m)) + [0], dtype=dtype) * 2 - A_nd = tvm.nd.array(AB_np, dev) - B_nd = tvm.nd.array(AB_np, dev) - C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), dev) - func(A_nd, B_nd, C_nd) - tvm.testing.assert_allclose(C_nd.numpy(), C_np, rtol=1e-3) - - check_cuda("float32") - check_cuda("float16") - - -@tvm.testing.requires_gpu -def test_lower_warp_memory_roundup(): - def check(device, m): - A = te.placeholder((m,), name="A") - B = te.compute((m,), lambda i: A[i] + 1, name="B") - - with tvm.target.Target(device): - s = te.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=32) - tx = te.thread_axis("threadIdx.x") - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[B].bind(xi, tx) - - AA = s.cache_read(A, "warp", [B]) - _, yi = s[AA].split(s[AA].op.axis[0], factor=32) - s[AA].bind(yi, tx) - s[AA].compute_at(s[B], xo) - - dev = tvm.device(device, 0) - # building with the CSE pass disabled as otherwise it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - func = tvm.build(s, [A, B], device) - A_np = np.random.uniform(size=(m,)).astype(A.dtype) - B_np = np.zeros(shape=(m,)).astype(B.dtype) - A_nd = tvm.nd.array(A_np, dev) - B_nd = tvm.nd.array(B_np, dev) - func(A_nd, B_nd) - B_np = A_np + 1 - tvm.testing.assert_allclose(B_nd.numpy(), B_np) - - for 
device in ["cuda", "rocm"]: - if not tvm.testing.device_enabled(device): - print("skip because", device, "is not enabled..") - continue - check(device, m=31) - check(device, m=32) - check(device, m=33) - check(device, m=63) - check(device, m=64) - check(device, m=65) - - -@tvm.testing.requires_cuda -def test_lower_warp_memory_same_thread(): - m = n = 128 - A = te.placeholder((m, n), name="A") - k = te.reduce_axis((0, n), name="k") - B = te.compute((m,), lambda i: te.sum(A[i, k], axis=[k])) - - s = te.create_schedule(B.op) - BB = s.cache_write(B, "warp") - tx = te.thread_axis("threadIdx.x") - xo, xi = s[B].split(B.op.axis[0], factor=32) - s[B].bind(xi, tx) - s[B].bind(xo, te.thread_axis("blockIdx.x")) - s[BB].compute_at(s[B], xo) - xo, xi = s[BB].split(s[BB].op.axis[0], factor=32) - s[BB].bind(xi, tx) - - # lowering with the CSE pass disabled as otherwise it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - mod = tvm.lower(s, [A, B], name="f") - - mod = _run_passes(mod) - fdevice = mod["f_kernel"] - assert "tvm_warp_shuffle" not in fdevice.script() - - -@tvm.testing.requires_cuda -def test_lower_warp_memory_divide_by_factor(): - ib = tvm.tir.ir_builder.IRBuilder() - bx = te.thread_axis("blockIdx.x") - tx = te.thread_axis("threadIdx.x") - - with ib.new_scope(): - ib.scope_attr(bx, "thread_extent", 32) - ib.scope_attr(tx, "thread_extent", 32) - t = ib.allocate("float32", 16, name="t", scope="warp") - n = ib.allocate("float32", 16, name="n", scope="local") - n[0] = t[0] - - stmt = ib.get() - func = tvm.tir.PrimFunc([], stmt) - func = func.with_attr("from_legacy_te_schedule", True) - # lowering with the CSE pass disabled as otherwise it would do some commoning - with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]): - mod = tvm.lower(func, name="f") - with pytest.raises(tvm.error.TVMError, match="Divide by zero") as cm: - _run_passes(mod) - - -if __name__ == "__main__": 
- tvm.testing.main() diff --git a/tests/python/tir-transform/test_tir_transform_make_packed_api.py b/tests/python/tir-transform/test_tir_transform_make_packed_api.py index f783ab2fcef1..8605d5185d90 100644 --- a/tests/python/tir-transform/test_tir_transform_make_packed_api.py +++ b/tests/python/tir-transform/test_tir_transform_make_packed_api.py @@ -21,32 +21,6 @@ import tvm.testing from tvm import te, tir from tvm.script import tir as T, ir as I -from tvm.driver.build_module import schedule_to_module - - -def test_makeapi(): - """Not yet working, mock design""" - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") - s = te.create_schedule(C.op) - - mod = schedule_to_module(s, [n, A, B, C]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - mod = tvm.tir.transform.Apply( - lambda f: f.with_attr( - { - "target": tvm.target.Target("llvm", host="llvm"), - "global_symbol": "main", - } - ) - )(mod) - - before = mod - after = tvm.tir.transform.MakePackedAPI()(before) - f = after["main"] - assert len(f.params) == 6 def _find_assignment(stmt, var_name): diff --git a/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py b/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py index 9bb0aaf6e8e8..ee78dab2cbfe 100644 --- a/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py +++ b/tests/python/tir-transform/test_tir_transform_merge_dynamic_shared_memory_allocations.py @@ -19,314 +19,10 @@ import tvm import tvm.testing from tvm import te -from tvm.driver.build_module import schedule_to_module from tvm.topi.math import cast from tvm.script import tir as T -def run_passes(sch, args): - mod = schedule_to_module(sch, args) - return tvm.transform.Sequential( - [ - tvm.tir.transform.StorageFlatten(64), - tvm.tir.transform.Simplify(), - tvm.tir.transform.VectorizeLoop(), - 
tvm.tir.transform.StorageRewrite(), - tvm.tir.transform.MergeSharedMemoryAllocations(), - ] - )(mod) - - -def verify_single_allocation(stmt, alloc_size=None): - num_alloc = [0] - alloc_extents = [] - - def verify(n): - if ( - isinstance(n, tvm.tir.Allocate) - and n.buffer_var.type_annotation.storage_scope == "shared.dyn" - ): - num_alloc[0] += 1 - alloc_extents.append(n.extents[0]) - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - assert num_alloc[0] == 1 - - if alloc_size: - assert alloc_extents[0] == alloc_size - - -@tvm.testing.requires_gpu -def test_matmul_dyn_shared(): - n = 1024 - block = 16 - A = te.placeholder((n, n), name="A", dtype="float16") - B = te.placeholder((n, n), name="B", dtype="float16") - - def syncthread(): - return tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])) - - def test_matmul_ir(A, B, C): - ib = tvm.tir.ir_builder.create() - - tx = te.thread_axis("threadIdx.x") - ty = te.thread_axis("threadIdx.y") - bx = te.thread_axis("blockIdx.x") - by = te.thread_axis("blockIdx.y") - ib.scope_attr(tx, "thread_extent", block) - ib.scope_attr(ty, "thread_extent", block) - ib.scope_attr(bx, "thread_extent", n // block) - ib.scope_attr(by, "thread_extent", n // block) - - A_sh = ib.allocate(A.dtype, (block, block), scope="shared.dyn", name="A_sh") # fp16 - B_sh = ib.allocate(B.dtype, (block, block), scope="shared.dyn", name="B_sh") # fp16 - # Create a dynamic shared memory for the accumulation. - # This is for testing merging dynamic shared memory alloctions with different data type. - # In practice, there is no need to allocate a shared memory for C. 
- C_local = ib.allocate(C.dtype, (1,), scope="local", name="C_local") - C_sh = ib.allocate(C.dtype, (block, block), scope="shared.dyn", name="C_sh") # fp32 - - A_ptr = ib.buffer_ptr(A) - B_ptr = ib.buffer_ptr(B) - C_ptr = ib.buffer_ptr(C) - - C_local[0] = 0.0 - - with ib.for_range(0, n // block, name="i") as i: - A_sh[ty, tx] = A_ptr[by * block + ty, i * block + tx] - B_sh[ty, tx] = B_ptr[i * block + ty, bx * block + tx] - ib.emit(syncthread()) - - with ib.for_range(0, block, name="k") as k: - C_local[0] += cast(A_sh[ty, k] * B_sh[k, tx], "float32") - ib.emit(syncthread()) - - C_sh[ty, tx] = C_local[0] - C_ptr[by * block + ty, bx * block + tx] = C_sh[ty, tx] - - return ib.get() - - C = te.extern( - A.shape, - [A, B], - lambda ins, outs: test_matmul_ir(ins[0], ins[1], outs[0]), - name="matmul", - dtype="float32", - ) - s = te.create_schedule(C.op) - mod = run_passes(s, [A, B, C]) - # C can be allocated at the start of A, so we only need to allocate 2 block * block memory with dtype = float16 - expected_alloc_size = block * block * 4 - verify_single_allocation(mod["main"].body, expected_alloc_size) - - def check_target(target): - if not tvm.testing.device_enabled(target): - return - - fmatmul = tvm.build(s, [A, B, C], target) - dev = tvm.device(target, 0) - - size = (n, n) - a_np = np.random.uniform(size=size).astype(A.dtype) - b_np = np.random.uniform(size=size).astype(B.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(size, dtype=C.dtype), dev) - fmatmul(a, b, c) - np_ref = np.dot(a_np.astype("float32"), b_np.astype("float32")) - tvm.testing.assert_allclose(c.numpy(), np_ref, 1e-4, 1e-4) - - for target in ["cuda", "nvptx"]: - check_target(target) - - -@tvm.testing.requires_gpu -def test_dyn_shared_vectorized_store(): - """Test vectorized store into dynamic shared memory""" - n = te.size_var("n") - A = te.placeholder((n,), name="A", dtype="float16") - B = te.placeholder((n,), name="B", dtype="float32") - - def 
test_device_ir(A, B, C): - n = A.shape[0] - ib = tvm.tir.ir_builder.create() - - values_per_thread = 4 - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", tvm.tir.indexdiv(n, values_per_thread)) - - A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn") # fp16 - B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn") # fp32 - - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - Cptr = ib.buffer_ptr(C) - - with ib.for_range(0, values_per_thread, kind="vectorize") as i: - A_sh[tx * values_per_thread + i] = Aptr[tx * values_per_thread + i] - B_sh[tx * values_per_thread + i] = Bptr[tx * values_per_thread + i] - - with ib.for_range(0, values_per_thread) as i: - Cptr[tx * values_per_thread + i] = ( - cast(A_sh[tx * values_per_thread + i], "float32") + B_sh[tx * values_per_thread + i] - ) - - return ib.get() - - C = te.extern( - (n,), - [A, B], - lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]), - name="vadd", - dtype="float32", - ) - s = te.create_schedule(C.op) - - mod = run_passes(s, [A, B, C]) - verify_single_allocation(mod["main"].body) - - def check_target(target): - if not tvm.testing.device_enabled(target): - return - - fadd = tvm.build(s, [A, B, C], target) - dev = tvm.device(target, 0) - - for n in [512, 1024]: - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose( - c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4 - ) - - for target in ["cuda", "nvptx"]: - check_target(target) - - -@tvm.testing.requires_gpu -def test_dyn_shared_reuse_and_merge(): - n = 64 - A = te.placeholder((n,), name="A", dtype="float32") - B = te.placeholder((n,), name="B", dtype="float32") - C = te.placeholder((te.size_var("n_dyn"),), name="C", dtype="float32") - - def test_device_ir(A, B, C, D): - ib = tvm.tir.ir_builder.create() - - tx = te.thread_axis("threadIdx.x") 
- ib.scope_attr(tx, "thread_extent", n) - - A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn", name="A_sh") - B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn", name="B_sh") - C_sh = ib.allocate(C.dtype, (C.shape[0],), scope="shared.dyn", name="C_sh") - - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - Cptr = ib.buffer_ptr(C) - Dptr = ib.buffer_ptr(D) - - A_sh[tx] = Aptr[tx] - Dptr[tx] = A_sh[tx] - - B_sh[tx] = Bptr[tx] - Dptr[tx] += B_sh[tx] - - C_sh[tx] = Cptr[tx] # C cannot reuse other buffers since it size is dynamic - Dptr[tx] += C_sh[tx] - - return ib.get() - - D = te.extern( - (n,), - [A, B, C], - lambda ins, outs: test_device_ir(ins[0], ins[1], ins[2], outs[0]), - name="vadd", - dtype="float32", - ) - s = te.create_schedule(D.op) - - mod = run_passes(s, [A, B, C, D]) - # merged allocation - # allocate(buf_dyn_shmem: Pointer(shared.dyn uint8), uint8, [((n_dyn*4) + 256)]); - verify_single_allocation(mod["main"].body) - - def check_target(target): - if not tvm.testing.device_enabled(target): - return - - fadd = tvm.build(s, [A, B, C, D], target) - dev = tvm.device(target, 0) - - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), dev) - d = tvm.nd.array(np.zeros((n,), dtype=D.dtype), dev) - fadd(a, b, c, d) - tvm.testing.assert_allclose(d.numpy(), a.numpy() + b.numpy() + c.numpy(), 1e-4, 1e-4) - - for target in ["cuda", "nvptx"]: - check_target(target) - - -def test_dyn_shared_more_dtype(): - """Test vectorized store into dynamic shared memory""" - n = 512 - A = te.placeholder((n,), name="A", dtype="int8") - B = te.placeholder((n,), name="B", dtype="int16") - - def test_device_ir(A, B, C): - n = A.shape[0] - ib = tvm.tir.ir_builder.create() - - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", n) - - A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn") # i8 - B_sh = ib.allocate(B.dtype, 
(n,), scope="shared.dyn") # i16 - C_sh = ib.allocate(C.dtype, (n,), scope="shared.dyn") # i32 - - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - Cptr = ib.buffer_ptr(C) - - A_sh[tx] = Aptr[tx] - B_sh[tx] = Bptr[tx] - - C_sh[tx] = cast(A_sh[tx], "int32") + cast(B_sh[tx], "int32") - Cptr[tx] = C_sh[tx] - return ib.get() - - C = te.extern( - (n,), - [A, B], - lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]), - name="vadd", - dtype="int32", - ) - s = te.create_schedule(C.op) - - mod = run_passes(s, [A, B, C]) - verify_single_allocation(mod["main"].body, n * 4) - - def check_target(target): - if not tvm.testing.device_enabled(target): - return - - fadd = tvm.build(s, [A, B, C], target) - dev = tvm.device(target, 0) - - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4) - - for target in ["cuda", "nvptx"]: - check_target(target) - - class TestMatmul(tvm.testing.CompareBeforeAfter): """Shared allocations should be merged, preserving DeclBuffer if present diff --git a/tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py b/tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py deleted file mode 100644 index be32514a720c..000000000000 --- a/tests/python/tir-transform/test_tir_transform_merge_static_shared_memory_allocations.py +++ /dev/null @@ -1,203 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import numpy as np - -import tvm -import tvm.testing -from tvm import te -from tvm.driver.build_module import schedule_to_module -from tvm.topi.math import cast -from tvm.script import tir as T - - -def run_passes(sch, args): - mod = schedule_to_module(sch, args) - with tvm.transform.PassContext(config={"tir.merge_static_smem": True}): - return tvm.transform.Sequential( - [ - tvm.tir.transform.StorageFlatten(64), - tvm.tir.transform.Simplify(), - tvm.tir.transform.VectorizeLoop(), - tvm.tir.transform.StorageRewrite(), - tvm.tir.transform.MergeSharedMemoryAllocations(), - ] - )(mod) - - -def verify_single_allocation(stmt, alloc_size=None): - num_alloc = [0] - alloc_extents = [] - - def verify(n): - if ( - isinstance(n, tvm.tir.Allocate) - and n.buffer_var.type_annotation.storage_scope == "shared" - ): - num_alloc[0] += 1 - alloc_extents.append(n.extents[0]) - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - assert num_alloc[0] == 1 - - if alloc_size: - assert alloc_extents[0] == alloc_size - - -@tvm.testing.requires_gpu -def test_matmul_shared(): - n = 1024 - block = 16 - A = te.placeholder((n, n), name="A", dtype="float16") - B = te.placeholder((n, n), name="B", dtype="float16") - - def syncthread(): - return tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"])) - - def test_matmul_ir(A, B, C): - ib = tvm.tir.ir_builder.create() - - tx = te.thread_axis("threadIdx.x") - ty = te.thread_axis("threadIdx.y") - bx = te.thread_axis("blockIdx.x") - by = te.thread_axis("blockIdx.y") - ib.scope_attr(tx, "thread_extent", block) - 
ib.scope_attr(ty, "thread_extent", block) - ib.scope_attr(bx, "thread_extent", n // block) - ib.scope_attr(by, "thread_extent", n // block) - - A_sh = ib.allocate(A.dtype, (block, block), scope="shared", name="A_sh") # fp16 - B_sh = ib.allocate(B.dtype, (block, block), scope="shared", name="B_sh") # fp16 - # Create a shared memory for the accumulation. - # This is for testing merging shared memory alloctions with different data type. - # In practice, there is no need to allocate a shared memory for C. - C_local = ib.allocate(C.dtype, (1,), scope="local", name="C_local") - C_sh = ib.allocate(C.dtype, (block, block), scope="shared", name="C_sh") # fp32 - - A_ptr = ib.buffer_ptr(A) - B_ptr = ib.buffer_ptr(B) - C_ptr = ib.buffer_ptr(C) - - C_local[0] = 0.0 - - with ib.for_range(0, n // block, name="i") as i: - A_sh[ty, tx] = A_ptr[by * block + ty, i * block + tx] - B_sh[ty, tx] = B_ptr[i * block + ty, bx * block + tx] - ib.emit(syncthread()) - - with ib.for_range(0, block, name="k") as k: - C_local[0] += cast(A_sh[ty, k] * B_sh[k, tx], "float32") - ib.emit(syncthread()) - - C_sh[ty, tx] = C_local[0] - C_ptr[by * block + ty, bx * block + tx] = C_sh[ty, tx] - - return ib.get() - - C = te.extern( - A.shape, - [A, B], - lambda ins, outs: test_matmul_ir(ins[0], ins[1], outs[0]), - name="matmul", - dtype="float32", - ) - s = te.create_schedule(C.op) - mod = run_passes(s, [A, B, C]) - # C can be allocated at the start of A, so we only need to allocate 2 block * block memory with dtype = float16 - expected_alloc_size = block * block * 4 - verify_single_allocation(mod["main"].body, expected_alloc_size) - - def check_target(target): - if not tvm.testing.device_enabled(target): - return - - fmatmul = tvm.build(s, [A, B, C], target) - dev = tvm.device(target, 0) - - size = (n, n) - a_np = np.random.uniform(size=size).astype(A.dtype) - b_np = np.random.uniform(size=size).astype(B.dtype) - a = tvm.nd.array(a_np, dev) - b = tvm.nd.array(b_np, dev) - c = tvm.nd.array(np.zeros(size, 
dtype=C.dtype), dev) - fmatmul(a, b, c) - np_ref = np.dot(a_np.astype("float32"), b_np.astype("float32")) - tvm.testing.assert_allclose(c.numpy(), np_ref, 1e-4, 1e-4) - - for target in ["cuda"]: - check_target(target) - - -@tvm.testing.requires_gpu -def test_shared_more_dtype(): - """Test vectorized store into shared memory""" - n = 512 - A = te.placeholder((n,), name="A", dtype="int8") - B = te.placeholder((n,), name="B", dtype="int16") - - def test_device_ir(A, B, C): - n = A.shape[0] - ib = tvm.tir.ir_builder.create() - - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", n) - - A_sh = ib.allocate(A.dtype, (n,), scope="shared") # i8 - B_sh = ib.allocate(B.dtype, (n,), scope="shared") # i16 - C_sh = ib.allocate(C.dtype, (n,), scope="shared") # i32 - - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - Cptr = ib.buffer_ptr(C) - - A_sh[tx] = Aptr[tx] - B_sh[tx] = Bptr[tx] - - C_sh[tx] = cast(A_sh[tx], "int32") + cast(B_sh[tx], "int32") - Cptr[tx] = C_sh[tx] - return ib.get() - - C = te.extern( - (n,), - [A, B], - lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]), - name="vadd", - dtype="int32", - ) - s = te.create_schedule(C.op) - - mod = run_passes(s, [A, B, C]) - verify_single_allocation(mod["main"].body, n * 4) - - def check_target(target): - if not tvm.testing.device_enabled(target): - return - - fadd = tvm.build(s, [A, B, C], target) - dev = tvm.device(target, 0) - - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4) - - for target in ["cuda"]: - check_target(target) - - -if __name__ == "__main__": - tvm.testing.main() diff --git a/tests/python/tir-transform/test_tir_transform_narrow_datatype.py b/tests/python/tir-transform/test_tir_transform_narrow_datatype.py index 
5ebdbe986082..93c680c846c5 100644 --- a/tests/python/tir-transform/test_tir_transform_narrow_datatype.py +++ b/tests/python/tir-transform/test_tir_transform_narrow_datatype.py @@ -16,7 +16,6 @@ # under the License. import tvm from tvm import te -from tvm.driver.build_module import schedule_to_module from tvm.script import tir as T from tvm.tir import const import tvm.testing @@ -163,27 +162,6 @@ def check(m, lanes, target_bits, target_dtype): check(const(2**16, dtype="int32"), 2, target_bits=16, target_dtype="int32") -def test_reduce(): - def check(m, target_bits, target_dtype): - A = te.placeholder((m,), name="A", dtype="float32") - k = te.reduce_axis((0, m), "k") - B = te.compute((), lambda *idx: te.sum(A[k], axis=k), name="B") - s = te.create_schedule(B.op) - stmt = lower_sch(s, [A, B], target_bits) - assert stmt[1].loop_var.dtype == target_dtype - - # i32 -> i32 - check(const(64, dtype="int32"), 32, "int32") - # i64 -> i32 - check(const(64, dtype="int64"), 32, "int32") - # i32 -> i16 - check(const(64, dtype="int32"), 16, "int16") - check(const(2**16, dtype="int32"), 16, "int32") - # symbolic - check(te.var("n", dtype="int32"), 32, "int32") - check(te.var("n", dtype="int64"), 32, "int64") - - def test_slice(): def check(m, n, target_bits, target_dtype): # The index may overflow in B, while not in A @@ -208,25 +186,6 @@ def check(m, n, target_bits, target_dtype): ) -def test_ramp_dtype_consistency(): - """ - for (i :int64, (int64)0, (int64)4) { - A[ramp(i*(int64)2, (int64)1, 2)] = cast(int64, 2 ** 31 - 1) * i; - } - The infer result: - base: int64 -> int64 (since i is involved in another int64 expr) - stride: int64 -> int32 - - Thus ramp should still use int64 for both stride and base after rewrite. 
- """ - n = tvm.tir.IntImm("int64", 4) - m = tvm.tir.IntImm("int64", 2) - A = te.compute((n, m), lambda i, j: tvm.tir.Cast("int64", 2**31 - 1) * i, name="A") - s = te.create_schedule(A.op) - s[A].vectorize(A.op.axis[1]) - lower_sch(s, [A], 32, extra_passes=[tvm.tir.transform.VectorizeLoop()]) - - def test_condition(): @T.prim_func def before(A: T.Buffer((128,), "float32"), B: T.Buffer((130,), "float32")): diff --git a/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py b/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py index 1a1e780a7272..8500f114610c 100644 --- a/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py +++ b/tests/python/tir-transform/test_tir_transform_plan_update_buffer_allocation_location.py @@ -236,17 +236,6 @@ def test_opaque_access(): _check(opaque_access, transformed_opaque_access) -def test_lower_te(): - x = te.placeholder((1,)) - y = te.compute((1,), lambda i: x[i] + 2) - s = te.create_schedule(y.op) - orig_mod = tvm.driver.build_module.schedule_to_module(s, [x, y]) - mod = tvm.tir.transform.PlanAndUpdateBufferAllocationLocation()(orig_mod) - tvm.ir.assert_structural_equal( - mod, orig_mod - ) # PlanAndUpdateBufferAllocationLocation should do nothing on TE - - def test_loop_carried_dependency(): """The buffer allocation should be above opaque iter var's loop scopes such that buffer accesses with loop carried dependencies are covered, diff --git a/tests/python/tir-transform/test_tir_transform_simplify.py b/tests/python/tir-transform/test_tir_transform_simplify.py index 0b2d5f16d833..bbd69d01cbb4 100644 --- a/tests/python/tir-transform/test_tir_transform_simplify.py +++ b/tests/python/tir-transform/test_tir_transform_simplify.py @@ -73,69 +73,6 @@ def test_if_likely(): assert not isinstance(body.body.body.then_case, tvm.tir.IfThenElse) -def test_basic_likely_elimination(): - n = te.size_var("n") - X = te.placeholder(shape=(n,), 
name="x") - W = te.placeholder(shape=(n + 1,), dtype="int32", name="w") - - def f(i): - start = W[i] - extent = W[i + 1] - W[i] - rv = te.reduce_axis((0, extent)) - return te.sum(X[rv + start], axis=rv) - - Y = te.compute(X.shape, f, name="y") - s = te.create_schedule([Y.op]) - stmt = tvm.lower(s, [X, W, Y], simple_mode=True) - assert "if" not in str(stmt) - - -def test_complex_likely_elimination(): - def cumsum(X): - """ - Y[i] = sum(X[:i]) - """ - (m,) = X.shape - s_state = te.placeholder((m + 1,), dtype="int32", name="state") - s_init = te.compute((1,), lambda _: tvm.tir.const(0, "int32")) - s_update = te.compute((m + 1,), lambda l: s_state[l - 1] + X[l - 1]) - return tvm.te.scan(s_init, s_update, s_state, inputs=[X], name="cumsum") - - def sparse_lengths_sum(data, indices, lengths): - oshape = list(data.shape) - oshape[0] = lengths.shape[0] - length_offsets = cumsum(lengths) - - def sls(n, d): - gg = te.reduce_axis((0, lengths[n])) - indices_idx = length_offsets[n] + gg - data_idx = indices[indices_idx] - data_val = data[data_idx, d] - return te.sum(data_val, axis=gg) - - return te.compute(oshape, sls) - - m, n, d, i, l = ( - te.size_var("m"), - te.size_var("n"), - te.size_var("d"), - te.size_var("i"), - te.size_var("l"), - ) - data_ph = te.placeholder((m, d * 32), name="data") - indices_ph = te.placeholder((i,), name="indices", dtype="int32") - lengths_ph = te.placeholder((n,), name="lengths", dtype="int32") - Y = sparse_lengths_sum(data_ph, indices_ph, lengths_ph) - s = te.create_schedule([Y.op]) - (n, d) = s[Y].op.axis - (do, di) = s[Y].split(d, factor=32) - (gg,) = s[Y].op.reduce_axis - s[Y].reorder(n, do, gg, di) - s[Y].vectorize(di) - stmt = tvm.lower(s, [data_ph, indices_ph, lengths_ph, Y], simple_mode=True) - assert "if" not in str(stmt) - - class BaseBeforeAfter(tvm.testing.CompareBeforeAfter): transitively_prove_inequalities = False convert_boolean_to_and_of_ors = False @@ -668,7 +605,6 @@ def expected(self, test_case): priors = 
analyzer.canonical_simplify(priors) if provable: - # well formed checker complains of undefined variables in condition @T.prim_func(check_well_formed=False) def func(A: T.Buffer(1, "bool")): diff --git a/tests/python/tir-transform/test_tir_transform_split_host_device.py b/tests/python/tir-transform/test_tir_transform_split_host_device.py index 2d0d8a68d83e..a7ea6d8cdd46 100644 --- a/tests/python/tir-transform/test_tir_transform_split_host_device.py +++ b/tests/python/tir-transform/test_tir_transform_split_host_device.py @@ -21,45 +21,6 @@ from tvm.script import tir as T -@tvm.testing.requires_cuda -def test_split_host_device_func_attr(): - m = te.size_var("m") - l = te.size_var("l") - A = te.placeholder((m, l), name="A") - - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - - s = te.create_schedule(A2.op) - xo, xi = s[A2].split(A2.op.axis[0], factor=8) - s[A2].bind(xo, te.thread_axis("blockIdx.x")) - s[A1].compute_at(s[A2], xo) - s[A1].set_scope("shared") - - mod = tvm.lower(s, [A, A2]) - - cuda_target = tvm.target.Target("cuda", host="llvm") - mod = tvm.tir.transform.Apply( - lambda f: f.with_attr({"global_symbol": "test", "target": cuda_target}) - )(mod) - - mod = tvm.ir.transform.Sequential( - [ - tvm.tir.transform.AnnotateDeviceRegions(), - tvm.tir.transform.SplitHostDevice(), - tvm.tir.transform.MakePackedAPI(), - tvm.tir.transform.LowerDeviceKernelLaunch(), - ] - )(mod) - - fdevice = mod["test_kernel"] - - assert fdevice.attrs["global_symbol"] == "test_kernel" - assert fdevice.attrs["calling_conv"].value == 2 - assert str(fdevice.attrs["target"]) == str(tvm.target.Target("cuda")) - assert fdevice.attrs["tir.is_global_func"].value - - def test_ssa_across_entire_module(): """The host and device functions should not share TIR vars diff --git a/tests/python/tir-transform/test_tir_transform_storage_flatten.py b/tests/python/tir-transform/test_tir_transform_storage_flatten.py index 
4a81ab93c763..2c97cc53af67 100644 --- a/tests/python/tir-transform/test_tir_transform_storage_flatten.py +++ b/tests/python/tir-transform/test_tir_transform_storage_flatten.py @@ -17,72 +17,9 @@ import tvm import tvm.testing from tvm import te -from tvm.driver.build_module import schedule_to_module from tvm.script import tir as T -def test_flatten2(): - m = te.size_var("m") - l = te.size_var("l") - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - - s = te.create_schedule(A2.op) - xo, xi = s[A2].split(A2.op.axis[0], 8) - s[A1].compute_at(s[A2], xo) - Ab = tvm.tir.decl_buffer(A.shape, A.dtype, name="A") - A2b = tvm.tir.decl_buffer(A2.shape, A2.dtype, name="A2") - - mod = schedule_to_module(s, [Ab, A2b], binds={A: Ab, A2: A2b}) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - - -def test_flatten_prefetch(): - A = te.placeholder((25, 100, 4), name="A") - _A = tvm.tir.decl_buffer(A.shape, A.dtype, name="A") - i = te.size_var("i") - j = te.size_var("j") - region = [tvm.ir.Range.from_min_extent(i[0], i[1]) for i in [(i, 2), (j, 8), (0, 4)]] - stmt = tvm.tir.Prefetch(_A, region) - - func = tvm.te.schedule.SchedulePostProcToPrimFunc([_A], stmt, {A: _A}) - - mod = tvm.IRModule.from_expr(func) - mod = tvm.transform.Sequential( - [tvm.tir.transform.StorageFlatten(64), tvm.tir.transform.Simplify()] - )(mod) - stmt = mod["main"].body - assert stmt.extent.value == 2 - assert isinstance(stmt.body, tvm.tir.For) - assert stmt.body.extent.value == 2 - - def assert_flat_loads(stmt): - if isinstance(stmt, tvm.tir.BufferLoad): - assert len(stmt.indices) == 1, "All prefetch indices should be flattened" - - tvm.tir.stmt_functor.post_order_visit(stmt, assert_flat_loads) - - -def test_flatten_storage_align(): - m = 8 - l = 16 - A = te.placeholder((m, l), name="A") - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, 
name="A2") - - s = te.create_schedule(A2.op) - s[A1].storage_align(A1.op.axis[0], 2, 1) - - mod = schedule_to_module(s, [A, A2]) - mod = tvm.transform.Sequential( - [tvm.tir.transform.StorageFlatten(64), tvm.tir.transform.Simplify()] - )(mod) - - stmt = mod["main"].body - assert stmt.extents[0].value == 17 * 8 - - def test_flatten_double_buffer(): @tvm.script.ir_module class ModFromScript: diff --git a/tests/python/tir-transform/test_tir_transform_storage_rewrite.py b/tests/python/tir-transform/test_tir_transform_storage_rewrite.py index 68149e7d64bb..ab91c6c7b330 100644 --- a/tests/python/tir-transform/test_tir_transform_storage_rewrite.py +++ b/tests/python/tir-transform/test_tir_transform_storage_rewrite.py @@ -21,39 +21,9 @@ import tvm import tvm.testing from tvm import te -from tvm.driver.build_module import schedule_to_module from tvm.script import tir as T -def test_storage_share(): - m = te.var("m") - l = te.var("l") - A = te.placeholder((m, l), name="A") - num_stage = 5 - B = A - for t in range(num_stage): - B = te.compute((m, l), lambda i, j: B[i, j] + (t + 1), name="A%d" % t) - - s = te.create_schedule(B.op) - mod = schedule_to_module(s, [A, B]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - - mod = tvm.tir.transform.Simplify()(mod) - mod = tvm.tir.transform.StorageRewrite()(mod) - stmt = mod["main"].body - - # verify only have one allocations. 
- # verify inplace folding works - num_alloc = [0] - - def verify(n): - if isinstance(n, tvm.tir.Allocate): - num_alloc[0] += 1 - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - assert num_alloc[0] == 1 - - def register_mem(scope_tb, max_bits): # Register mem @tvm.register_func("tvm.info.mem.%s" % scope_tb) @@ -163,103 +133,6 @@ def verify(n): dtype_test(dtype_list, length) -def test_inplace_rule(): - m = 10 - A = te.placeholder((m,), name="A") - A0 = te.compute((m,), lambda i: A[i], name="A0") - A1 = te.compute((m,), lambda i: A[i] + 1, name="A1") - AA = te.compute((m,), lambda i: A0[i] + A1[i] + A1[0], name="AA") - B = te.compute((m,), lambda i: AA[i] + 1, name="B") - s = te.create_schedule(B.op) - mod = schedule_to_module(s, [A, B]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - - mod = tvm.tir.transform.Simplify()(mod) - mod = tvm.tir.transform.StorageRewrite()(mod) - stmt = mod["main"].body - - # verify only have one allocations. - # verify inplace folding works - num_alloc = [0] - - def verify(n): - if isinstance(n, tvm.tir.Allocate): - num_alloc[0] += 1 - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - assert num_alloc[0] == 2 - - -def test_storage_combine(): - n = 8 - A = te.placeholder((4,), name="A") - num_stage = 5 - B = A - stages = [] - for t in range(num_stage): - B = te.compute((n,), lambda i: B[i] + B[0] + (t + 1), name="A%d" % t) - stages.append(B) - - s = te.create_schedule(B.op) - for S in stages[:-1]: - s[S].set_scope("global:tag") - - mod = schedule_to_module(s, [A, B]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - - mod = tvm.tir.transform.Simplify()(mod) - mod = tvm.tir.transform.StorageRewrite()(mod) - stmt = mod["main"].body - - num_alloc = [0] - - def verify(n): - if isinstance(n, tvm.tir.Allocate): - num_alloc[0] += 1 - assert n.extents[0].value == 16 - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - assert num_alloc[0] == 1 - - -def test_storage_combine_with_vectorization(): - n = 1024 - A = 
te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute((n,), lambda i: A[i] + B[i], name="C") - s = te.create_schedule(C.op) - AA = s.cache_read(A, "global:tag", readers=[C]) - BB = s.cache_read(B, "global:tag", readers=[C]) - CC = s.cache_write(C, "global:tag") - s[CC].vectorize(s[CC].op.axis[0]) - mod = schedule_to_module(s, [A, B, C]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - mod = tvm.tir.transform.VectorizeLoop()(mod) - mod = tvm.tir.transform.StorageRewrite()(mod) - mod = tvm.tir.transform.Simplify()(mod) - stmt = mod["main"].body - num_alloc = [0] - - def verify(v): - # find add op - if ( - isinstance(v, tvm.tir.Add) - and isinstance(v.a, tvm.tir.BufferLoad) - and isinstance(v.b, tvm.tir.BufferLoad) - ): - lhs_ramp = v.a.indices[0] - rhs_ramp = v.b.indices[0] - # these two ramp load should not overlap - assert lhs_ramp.lanes == n - assert rhs_ramp.lanes == n - assert lhs_ramp.base >= rhs_ramp.base + n or rhs_ramp.base >= lhs_ramp.base + n - elif isinstance(v, tvm.tir.Allocate): - num_alloc[0] += 1 - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - assert num_alloc[0] == 1 - - def test_address_of(): # In this test, the storage rewrite pass is allowed to # combine buffers B and D, but not C @@ -313,40 +186,6 @@ def verify(n): assert total_alloc[0] == 16 -def test_storage_share_gpu(): - m = te.var("m") - A = [te.placeholder((m), name="A")] - num_stage = 5 - for t in range(num_stage): - A.append(te.compute((m,), lambda i: A[-1][i] + (t + 1), name="A%d_s" % t)) - A.append(te.compute((m,), lambda i: A[-1][i], name="A%d" % t)) - s = te.create_schedule(A[-1].op) - for t in range(num_stage): - x = A[2 * t + 2].op.axis[0] - bx, tx = s[A[2 * t + 2]].split(x, factor=32) - s[A[2 * t + 2]].bind(bx, te.thread_axis("blockIdx.x")) - s[A[2 * t + 2]].bind(tx, te.thread_axis("threadIdx.x")) - s[A[2 * t + 1]].compute_at(s[A[2 * t + 2]], tx) - s[A[2 * t + 1]].set_scope("shared") - - mod = schedule_to_module(s, [A[0], A[-1]]) - mod = 
tvm.tir.transform.StorageFlatten(64)(mod) - mod = tvm.tir.transform.Simplify()(mod) - mod = tvm.tir.transform.StorageRewrite()(mod) - stmt = mod["main"].body - - alloc_stats = {"global": 0, "shared": 0} - - def verify(n): - if isinstance(n, tvm.tir.Allocate): - scope = n.buffer_var.type_annotation.storage_scope - alloc_stats[scope] += 1 - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - assert alloc_stats["global"] == 2 - assert alloc_stats["shared"] == num_stage - - def test_parallel_alloc(): ib = tvm.tir.ir_builder.create() n = te.var("n") @@ -443,125 +282,6 @@ def get_mod(kind="serial"): assert isinstance(body.body.body, tvm.tir.Allocate) # A -def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024): - # Test Buffer - register_mem(scope_tb, max_bits) - m = 10 - A = te.placeholder((m,), name="A") - C = te.placeholder((m,), name="C") - D = te.placeholder((m,), name="D") - A0 = te.compute((m,), lambda i: A[i] + C[i], name="A0") - A1 = te.compute((m,), lambda i: D[i] * D[i], name="A1") - A2 = te.compute((m,), lambda i: A0[i] + A1[i], name="A2") - B = te.compute((m,), lambda i: A2[i], name="B") - s = te.create_schedule(B.op) - A0L = s.cache_read(A0, scope_tb, [A2]) - A1L = s.cache_read(A1, scope_tb, [A2]) - A2L = s.cache_read(A2, scope_tb, [B]) - mod = schedule_to_module(s, [A, B, C, D]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - - mod = tvm.tir.transform.Simplify()(mod) - mod = tvm.tir.transform.StorageRewrite()(mod) - stmt = mod["main"].body - - # verify only have one allocations. 
- # verify inplace folding works - num_alloc = [0] - - def verify(n): - if isinstance(n, tvm.tir.Allocate): - num_alloc[0] += 1 - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - assert num_alloc[0] == 2 - - -def test_exceed_mem(): - max_bits = 639 - # The critical max_num_bits is between 639 and 640 - loc = -1 - try: - test_inplace_rule2("local_TEM", max_bits) - except Exception as e: - estr = str(e) - loc = estr.find("Allocation exceed bound of memory") - assert loc != -1 - - -def test_inplace_rule3(): - # Test Buffer - scope_tb = "local_TB3" - max_bits = 1024 * 1024 * 1024 - - register_mem(scope_tb, max_bits) - m = 10 - B0 = te.placeholder((m,), name="B0") - B1 = te.placeholder((m,), name="B1") - B2 = te.placeholder((m,), name="B2") - B3 = te.placeholder((m,), name="B3") - B4 = te.placeholder((m,), name="B4") - B5 = te.placeholder((m,), name="B5") - - B6 = te.compute((m,), lambda i: B1[i] * B5[i], name="B6") - B7 = te.compute((m,), lambda i: B2[i] * B4[i], name="B7") - B8 = te.compute((m,), lambda i: B6[i] - B7[i], name="B8") - - B9 = te.compute((m,), lambda i: B2[i] * B3[i], name="B9") - B10 = te.compute((m,), lambda i: B0[i] * B5[i], name="B10") - B11 = te.compute((m,), lambda i: B9[i] - B10[i], name="B11") - - B12 = te.compute((m,), lambda i: B0[i] * B4[i], name="B12") - B13 = te.compute((m,), lambda i: B1[i] * B3[i], name="B13") - B14 = te.compute((m,), lambda i: B12[i] - B13[i], name="B14") - - B = te.compute((m,), lambda i: B8[i] * B11[i] + B14[i], name="B") - s = te.create_schedule(B.op) - - B1L = s.cache_read(B1, scope_tb, [B6, B13]) - B5L = s.cache_read(B5, scope_tb, [B6, B10]) - B2L = s.cache_read(B2, scope_tb, [B7, B9]) - B4L = s.cache_read(B4, scope_tb, [B7, B12]) - B3L = s.cache_read(B3, scope_tb, [B9, B13]) - B0L = s.cache_read(B0, scope_tb, [B10, B12]) - - B8L = s.cache_write(B8, scope_tb) - B11L = s.cache_write(B11, scope_tb) - B14L = s.cache_write(B14, scope_tb) - B6L = s.cache_write(B6, scope_tb) - B7L = s.cache_write(B7, scope_tb) - B9L 
= s.cache_write(B9, scope_tb) - B10L = s.cache_write(B10, scope_tb) - B12L = s.cache_write(B12, scope_tb) - B13L = s.cache_write(B13, scope_tb) - - s[B12].compute_inline() - s[B13].compute_inline() - s[B8].compute_inline() - s[B11].compute_inline() - s[B14].compute_inline() - s[B6].compute_inline() - s[B7].compute_inline() - s[B9].compute_inline() - s[B10].compute_inline() - - s = s.normalize() - mod = schedule_to_module(s, [B0, B1, B2, B3, B4, B5, B]) - mod = tvm.tir.transform.StorageFlatten(64)(mod) - - mod = tvm.tir.transform.Simplify()(mod) - mod = tvm.tir.transform.StorageRewrite()(mod) - stmt = mod["main"].body - - # verify only have one allocations. - # verify inplace folding works - def verify(n): - if isinstance(n, tvm.tir.Allocate): - assert n.extents[0].value == 70 - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - - def test_alloc_seq_type(): ib = tvm.tir.ir_builder.create() n = te.var("n") @@ -665,46 +385,6 @@ def verify(n): assert num_alloc[0] == 1 -def test_replace_dataflow(): - shape = (255,) - A = te.placeholder(shape, name="A") - B = te.compute(shape, lambda i: A[i] + A[i], name="B") - C = te.compute(shape, lambda i: A[i] + B[i], name="C") - D = te.compute(shape, lambda i: A[i] + C[i], name="D") - E = te.compute(shape, lambda i: A[i] + D[i], name="E") - - s = te.create_schedule(E.op) - s.cache_read(A, "local", [B, C, D, E]) - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - - -def test_large_input(): - @te.hybrid.script - def compute(a, b): - n = 16384 - c = output_tensor((n, n), "int32") - for i in range(n): - for j in range(n): - c[i, j] = a[i, j] - b[i, j] - return c - - n = 16384 - shape = (n, n) - a = te.placeholder(shape, name="a", dtype="int32") - b = te.placeholder(shape, name="b", dtype="int32") - c = te.compute(shape, lambda i, j: compute(a, b)[i, j]) - c = te.compute(shape, lambda i, j: 1 + c[i, j]) - s = te.create_schedule(c.op) - stmt = tvm.lower(s, [a, b, c])["main"].body - - def 
verify(n): - if isinstance(n, tvm.tir.Allocate): - assert n.extents[0].value == 268435456 - - tvm.tir.stmt_functor.post_order_visit(stmt, verify) - - def test_access_in_let_value(): @T.prim_func def func(A: T.Buffer((8,), "float32")): diff --git a/tests/python/tir-transform/test_tir_transform_thread_sync.py b/tests/python/tir-transform/test_tir_transform_thread_sync.py index 5c43d8d96aa1..4ca33424c1d5 100644 --- a/tests/python/tir-transform/test_tir_transform_thread_sync.py +++ b/tests/python/tir-transform/test_tir_transform_thread_sync.py @@ -35,67 +35,6 @@ def run_passes(func: tvm.tir.PrimFunc): return tvm.tir.transform.ThreadSync("shared")(mod) -@tvm.testing.requires_cuda -def test_thread_storage_sync(): - m = te.size_var("m") - l = te.size_var("l") - A = te.placeholder((m, l), name="A") - - A1 = te.compute((m, l), lambda i, j: A[i, j], name="A1") - A2 = te.compute((m, l), lambda i, j: A1[i, j] + 3, name="A2") - - s = te.create_schedule(A2.op) - xo, xi = s[A2].split(A2.op.axis[0], factor=8) - s[A2].bind(xo, te.thread_axis("blockIdx.x")) - s[A1].compute_at(s[A2], xo) - s[A1].set_scope("shared") - - bounds = tvm.te.schedule.InferBound(s) - assert isinstance(bounds, tvm.container.Map) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) - - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, A2], stmt, None) - mod = run_passes(func) - f = mod["test_kernel"] - body_list = tvm.tir.stmt_list(f.body.body.body.body.body.body) - assert body_list[1].value.op.same_as(tvm.ir.Op.get("tir.tvm_storage_sync")) - - -@tvm.testing.requires_cuda -def test_sync_else_branch(): - def ir(A, B): - ib = tvm.tir.ir_builder.create() - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", 1) - - local = ib.allocate(A.dtype, (8,), name="buf_local", scope="local") - shared = ib.allocate(A.dtype, (8,), name="buf_shared", scope="shared") - - with ib.for_range(0, 8) as i: - with ib.if_scope(Aptr[i] < 0): - local[i] = Aptr[i] - 
with ib.else_scope(): - shared[i] = Aptr[i] - - with ib.for_range(0, 8) as i: - with ib.if_scope(Aptr[i] < 0): - Bptr[i] = local[i] - with ib.else_scope(): - Bptr[i] = shared[i] - - return ib.get() - - A = tvm.tir.decl_buffer((8,), "float32") - B = tvm.tir.decl_buffer((8,), "float32") - stmt = ir(A, B) - func = tvm.te.schedule.SchedulePostProcToPrimFunc([A, B], stmt, None) - mod = run_passes(func) - assert "T.tvm_storage_sync" in str(mod) - - @tvm.testing.requires_cuda def test_sync_read_thread_id_independent_location(): @T.prim_func diff --git a/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py b/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py index 9ee86433128d..a419dc3f9976 100644 --- a/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py +++ b/tests/python/tir-transform/test_tir_transform_unify_thread_binding.py @@ -313,16 +313,5 @@ def unified_inner_binding_with_annotation( _check(inner_binding_with_annotation, unified_inner_binding_with_annotation) -def test_lower_te(): - a = te.placeholder((32, 2, 2)) - b = te.compute((32, 2, 2), lambda i, j, k: a[i, j, k] * 2.0) - s = te.create_schedule(b.op) - s[b].bind(b.op.axis[1], te.thread_axis("threadIdx.x")) - s[b].bind(b.op.axis[2], te.thread_axis("threadIdx.x")) - orig_mod = tvm.driver.build_module.schedule_to_module(s, [a, b]) - mod = tvm.tir.transform.UnifyThreadBinding()(orig_mod) - tvm.ir.assert_structural_equal(mod, orig_mod) # UnifyThreadBinding should do nothing on TE - - if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/tir-transform/test_tir_transform_unroll_loop.py b/tests/python/tir-transform/test_tir_transform_unroll_loop.py index a05a085eeb64..37dc64a9e79c 100644 --- a/tests/python/tir-transform/test_tir_transform_unroll_loop.py +++ b/tests/python/tir-transform/test_tir_transform_unroll_loop.py @@ -94,23 +94,6 @@ def test_unroll_fake_loop(): assert isinstance(ret[0], tvm.tir.BufferStore) -def 
test_unroll_single_count_loops(): - n = te.size_var("n") - A = te.placeholder((n,), name="A") - B = te.compute((n,), lambda *i: A(*i), name="B") - s = te.create_schedule(B.op) - s = s.normalize() - dom_map = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, dom_map) - # all parameters to UnrolLoops are default values except for - # auto_unroll_max_extent which has been set to 1 (default:0) - mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([], stmt)) - - with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 1}}): - ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body - assert ret == stmt - - def test_unroll_allocations(): @tvm.script.ir_module class before: @@ -179,5 +162,4 @@ def main(B: T.Buffer((64,), "float32")): test_unroll_local_access() test_unroll_loop() test_unroll_fake_loop() - test_unroll_single_count_loops() test_unroll_allocations() diff --git a/tests/python/tir-transform/test_tir_transform_vectorize.py b/tests/python/tir-transform/test_tir_transform_vectorize.py index 9659d896aed8..c5569c829ad5 100644 --- a/tests/python/tir-transform/test_tir_transform_vectorize.py +++ b/tests/python/tir-transform/test_tir_transform_vectorize.py @@ -197,16 +197,6 @@ def main(a: T.handle, n: T.int32, x: T.int32): tvm.ir.assert_structural_equal(mod, After) -def test_vectorize_with_if_cond_int64(): - m = te.size_var("m", dtype="int64") - A = te.placeholder((m,), name="A", dtype="float32") - B = te.compute((m,), lambda i: te.if_then_else(i < 2, A[i], A[i] * 2), name="B") - s = te.create_schedule(B.op) - x, y = s[B].split(B.op.axis[0], factor=4) - s[B].vectorize(y) - f = tvm.build(s, [A, B], "llvm") - - @pytest.mark.parametrize("extent, target", [(4, simple_target), (T.vscale() * 4, sve_target)]) def test_vectorize_let(extent, target): @I.ir_module @@ -371,10 +361,9 @@ def test_ir(A, B, C): name="while_vectorize", dtype=dtype, ) - s = te.create_schedule(C.op) try: - tvm.lower(s, [A, B, C], "llvm") + tvm.build(te.create_prim_func([A, 
B, C]), target="llvm") assert False except tvm.error.TVMError as e: error_msg = str(e).split("\n")[-1] @@ -382,14 +371,6 @@ def test_ir(A, B, C): assert expected in error_msg -def test_vectorize_dtype_mismatch(): - n = tvm.tir.IntImm("int64", 4) - A = te.compute((n,), lambda i: tvm.tir.IntImm("int64", 2**31 - 1) + i, name="A") - s = te.create_schedule(A.op) - s[A].vectorize(A.op.axis[0]) - tvm.lower(s, [A], "llvm", simple_mode=True) - - @pytest.mark.parametrize( "extent, vec_str, target", [(16, "float32x16", simple_target), (T.vscale() * 8, "float32xvscalex8", sve_target)], @@ -815,7 +796,7 @@ def main(A: T.Buffer((25,), "float32"), B: T.Buffer((25,), "float32")): with tvm.target.Target(target): mod = tvm.tir.transform.VectorizeLoop()(Before) tvm.ir.assert_structural_equal(mod, After) - mod = tvm.build(mod, target) + mod = tvm.build(mod, target=target) @pytest.mark.parametrize( @@ -843,7 +824,7 @@ def main(A: T.Buffer((25,), "int32"), B: T.Buffer((25,), "float32")): with pytest.raises(Exception) as e_info: with tvm.target.Target(target): mod = tvm.tir.transform.VectorizeLoop()(Before) - ex = tvm.build(mod, target) + ex = tvm.build(mod, target=target) tvm.ir.assert_structural_equal(mod, After) assert "Intrinsic does not support vectors" in e_info.value.args[0] diff --git a/tests/scripts/ci.py b/tests/scripts/ci.py index 0bd97e4ee048..10d63129121f 100755 --- a/tests/scripts/ci.py +++ b/tests/scripts/ci.py @@ -683,7 +683,6 @@ def add_subparser( "run full Python tests", [ "./tests/scripts/task_python_unittest.sh", - "./tests/scripts/task_python_arm_compute_library.sh", ], ), }, diff --git a/tests/scripts/task_java_unittest.sh b/tests/scripts/task_java_unittest.sh index a35b023ad0df..2eabac31cc28 100755 --- a/tests/scripts/task_java_unittest.sh +++ b/tests/scripts/task_java_unittest.sh @@ -35,8 +35,8 @@ cleanup() } trap cleanup 0 -python3 "$SCRIPT_DIR"/test_add_cpu.py "$TEMP_DIR" -python3 "$SCRIPT_DIR"/test_add_gpu.py "$TEMP_DIR" +# python3 "$SCRIPT_DIR"/test_add_cpu.py 
"$TEMP_DIR" +# python3 "$SCRIPT_DIR"/test_add_gpu.py "$TEMP_DIR" # Skip the Java RPC Unittests, see https://github.com/apache/tvm/issues/13168 # # start rpc proxy server diff --git a/tests/scripts/task_web_wasm.sh b/tests/scripts/task_web_wasm.sh index 91bbbac52300..8a08c1ecb58d 100755 --- a/tests/scripts/task_web_wasm.sh +++ b/tests/scripts/task_web_wasm.sh @@ -25,9 +25,8 @@ cd web make clean npm install npm run lint -# TODO(@tqchen, @siyuan): re-enable the following tests -# npm run prepwasm -# npm run bundle -# npm run test -# npm run typedoc +npm run prepwasm +npm run bundle +npm run test +npm run typedoc cd .. diff --git a/web/tests/python/webgpu_rpc_test.py b/web/tests/python/webgpu_rpc_test.py index 6a87c1bbe556..e831afd9d3f8 100644 --- a/web/tests/python/webgpu_rpc_test.py +++ b/web/tests/python/webgpu_rpc_test.py @@ -24,7 +24,6 @@ from tvm import te from tvm import rpc from tvm.contrib import utils, tvmjs -from tvm.relay.backend import Runtime import numpy as np proxy_host = "127.0.0.1" @@ -48,7 +47,7 @@ def test_rpc(): sch.bind(i0, "blockIdx.x") sch.bind(i1, "threadIdx.x") - fadd = tvm.build(sch.mod, target=target, runtime=runtime) + fadd = tvm.build(sch.mod.with_attr("system_lib_prefix", ""), target=target) temp = utils.tempdir() wasm_path = temp.relpath("addone_gpu.wasm") diff --git a/web/tests/python/websock_rpc_test.py b/web/tests/python/websock_rpc_test.py deleted file mode 100644 index f7011cef4723..000000000000 --- a/web/tests/python/websock_rpc_test.py +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Simple testcode to test Javascript RPC - -To use it, start a rpc proxy with "python -m tvm.exec.rpc_proxy". -Connect javascript end to the websocket port and connect to the RPC. -""" - -import tvm -from tvm import te -from tvm import rpc -from tvm.contrib import utils, tvmjs -from tvm.relay.backend import Runtime -import numpy as np - -proxy_host = "127.0.0.1" -proxy_port = 9090 - - -def test_rpc(): - if not tvm.runtime.enabled("rpc"): - return - # generate the wasm library - runtime = Runtime("cpp", {"system-lib": True}) - target = "llvm -mtriple=wasm32-unknown-unknown-wasm" - if not tvm.runtime.enabled(target): - raise RuntimeError("Target %s is not enbaled" % target) - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") - s = te.create_schedule(B.op) - - fadd = tvm.build(s, [A, B], target, runtime=runtime, name="addone") - temp = utils.tempdir() - - wasm_path = temp.relpath("addone.wasm") - fadd.export_library(wasm_path, fcompile=tvmjs.create_tvmjs_wasm) - - wasm_binary = open(wasm_path, "rb").read() - - remote = rpc.connect( - proxy_host, - proxy_port, - key="wasm", - session_constructor_args=["rpc.WasmSession", wasm_binary], - ) - - def check(remote): - # basic function checks. 
- faddone = remote.get_function("testing.asyncAddOne") - fecho = remote.get_function("testing.echo") - assert faddone(100) == 101 - assert fecho(1, 2, 3) == 1 - assert fecho(1, 2, 3) == 1 - assert fecho(100, 2, 3) == 100 - assert fecho("xyz") == "xyz" - assert bytes(fecho(bytearray(b"123"))) == b"123" - # run the generated library. - f1 = remote.system_lib() - dev = remote.cpu(0) - a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev) - # invoke the function - addone = f1.get_function("addone") - addone(a, b) - - # time evaluator - time_f = f1.time_evaluator("addone", dev, number=100, repeat=10) - time_f(a, b) - cost = time_f(a, b).mean - print("%g secs/op" % cost) - np.testing.assert_equal(b.numpy(), a.numpy() + 1) - - check(remote) - - -test_rpc()