From d9eb9e74d208309df9e4eb1d630a26ac8a5fe86a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 15 Sep 2020 17:11:46 -0700 Subject: [PATCH 1/8] add tutorial --- docs/conf.py | 1 + python/tvm/auto_scheduler/__init__.py | 2 +- python/tvm/auto_scheduler/auto_schedule.py | 27 +++++ tutorials/auto_scheduler/README.txt | 2 + tutorials/auto_scheduler/tune_matmul_x86.py | 103 ++++++++++++++++++++ tutorials/autotvm/README.txt | 2 +- 6 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 tutorials/auto_scheduler/README.txt create mode 100644 tutorials/auto_scheduler/tune_matmul_x86.py diff --git a/docs/conf.py b/docs/conf.py index ca0bc9ba3de5..9322f5a38d12 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -193,6 +193,7 @@ "../tutorials/language", "../tutorials/optimize", "../tutorials/autotvm", + "../tutorials/auto_scheduler", "../tutorials/dev", "../tutorials/topi", "../tutorials/deployment", diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py index 43e08a4aafa6..2b362872c814 100644 --- a/python/tvm/auto_scheduler/__init__.py +++ b/python/tvm/auto_scheduler/__init__.py @@ -26,7 +26,7 @@ from . 
import feature # Shortcut -from .auto_schedule import SearchTask, TuningOptions, HardwareParams, auto_schedule +from .auto_schedule import SearchTask, TuningOptions, HardwareParams, create_task, auto_schedule from .compute_dag import ComputeDAG from .cost_model import RandomModel, XGBModel from .measure import ( diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py index af257f5aa8a4..0b80b8c3d432 100644 --- a/python/tvm/auto_scheduler/auto_schedule.py +++ b/python/tvm/auto_scheduler/auto_schedule.py @@ -31,7 +31,10 @@ import tvm._ffi from tvm.runtime import Object from .measure import LocalBuilder, LocalRunner +from .workload_registry import make_workload_key, workload_key_to_tensors +from .compute_dag import ComputeDAG from .search_policy import EmptyPolicy +from .utils import get_func_name from . import _ffi_api @@ -155,6 +158,30 @@ def __init__( measure_callbacks, ) +def create_task(func, args, target, target_host=None, hardware_params=None): + """Create a search task + + Parameters + ---------- + func : Union[Function, str] + The function that returns the compute declaration Tensors. + Can be the a function or the function name. + args : Args + The args of the function. + target : tvm.target.Target + The target device of this search task. + target_host : Optional[tvm.target.Target] + The target host device of this search task. + hardware_params : Optional[HardwareParams] + Hardware parameters used in this search task. + + Returns + ------- + task : the created task + """ + workload_key = make_workload_key(func, args) + dag = ComputeDAG(workload_key) + return SearchTask(dag, workload_key, target, target_host, hardware_params) def auto_schedule(task, search_policy=None, tuning_options=TuningOptions()): """Do auto scheduling for a computation declaration. 
diff --git a/tutorials/auto_scheduler/README.txt b/tutorials/auto_scheduler/README.txt new file mode 100644 index 000000000000..52c4c14b15b0 --- /dev/null +++ b/tutorials/auto_scheduler/README.txt @@ -0,0 +1,2 @@ +AutoScheduler : Template-free Auto Scheduling +----------- diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py new file mode 100644 index 000000000000..2fd30a3c54ff --- /dev/null +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Using the template-free auto-scheduler on CPU +============================================= +**Author**: `Lianmin Zheng `_, \ + `Chengfan Jia `_ + +This is a tutorial on how to use the auto-scheduler in TVM. + +Different from the exiting autotvm which relies on manual templates to +define the search space, the auto-scheduler does not require any templates. +The user only needs to write the computation declaration, +the auto-scheduler then automatically generate a large +search space and begins the search (or auto-tuning). + +We use matrix multiplication as an example in this tutorial. 
+""" + +import numpy as np +import tvm +from tvm import te, testing, auto_scheduler + +###################################################################### +# To begin with, we define the computation of a matmul with bias add. +# The function should return the list of input/output tensors. +# From these tensors, the auto-scheduler can get the whole computational graph. + +@auto_scheduler.register_workload +def matmul_add(N, L, M, dtype): + A = te.placeholder((N, L), name='A', dtype=dtype) + B = te.placeholder((L, M), name='B', dtype=dtype) + C = te.placeholder((N, M), name='C', dtype=dtype) + + k = te.reduce_axis((0, L), name='k') + matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), + name='matmul') + D = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name='D') + + return [A, B, C, D] + +###################################################################### +# We then create the a search task with N=L=M=128 and dtype='float32' + +target = tvm.target.Target("llvm") +task = auto_scheduler.create_task(matmul_add, (128, 128, 128, 'float32'), target) + +print(task.compute_dag) + +###################################################################### +# Next, we set parameters for the auto-scheduler. +# `num_measure_trials` is the number of measurement trials we can use during the search. +# We only make 10 trials in this tutorial for fast demonstration. In practice, 1000 is a good value for +# the search to converge. You can do more trials according to your time budget. +# In addition, we use `RecordToFile` to log measurement records into a file `test.json`. +# The measurement records can be used to query the history best, resume the search, +# or train the cost model later. + +tune_option = auto_scheduler.TuningOptions(num_measure_trials=2, + measure_callbacks=[auto_scheduler.RecordToFile('test.json')]) + +###################################################################### +# Now we get all inputs ready. Pretty simple, isn't it? 
+# We can kick off the search and let the auto-scheduler do its magic. +# After some measurement trials, it will return the best schedule it founds. + +sch, args = auto_scheduler.auto_schedule(task, + tuning_options=tune_option) + +###################################################################### +# We can lower schedule to see the IR after auto-scheduling. +# We can also build the binary function as usual. + +print(tvm.lower(sch, args, simple_mode=True)) +func = tvm.build(sch, args) + +###################################################################### +# Finally, let use do a correctness check + +# check correctness +a_np = np.random.uniform(size=(128, 128)).astype(np.float32) +b_np = np.random.uniform(size=(128, 128)).astype(np.float32) +c_np = np.random.uniform(size=(128, 128)).astype(np.float32) +d_np = a_np.dot(b_np) + c_np + +d_tvm = tvm.nd.empty(d_np.shape) +func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d_tvm) + +tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-2) \ No newline at end of file diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt index 38e3b3343f4e..970430320400 100644 --- a/tutorials/autotvm/README.txt +++ b/tutorials/autotvm/README.txt @@ -1,4 +1,4 @@ .. 
_tutorials-autotvm-sec: -Auto tuning +AutoTVM : Template-based Auto Tuning ----------- From 48aa0e9c9281fe777cd07542299fb35ef645947a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 15 Sep 2020 18:34:13 -0700 Subject: [PATCH 2/8] add tutorial --- docs/api/python/auto_scheduler.rst | 35 ++++++++ docs/api/python/autotvm.rst | 2 +- docs/api/python/index.rst | 1 + python/tvm/auto_scheduler/auto_schedule.py | 19 ++-- tutorials/auto_scheduler/README.txt | 2 +- tutorials/auto_scheduler/tune_matmul_x86.py | 96 ++++++++++++++------- tutorials/autotvm/README.txt | 2 +- 7 files changed, 114 insertions(+), 43 deletions(-) create mode 100644 docs/api/python/auto_scheduler.rst diff --git a/docs/api/python/auto_scheduler.rst b/docs/api/python/auto_scheduler.rst new file mode 100644 index 000000000000..85ff22f58b37 --- /dev/null +++ b/docs/api/python/auto_scheduler.rst @@ -0,0 +1,35 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +tvm.auto_scheduler +------------------ +.. automodule:: tvm.auto_scheduler + +tvm.auto_scheduler.auto_schedule +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. automodule:: tvm.auto_scheduler.auto_schedule + +.. autoclass:: tvm.auto_scheduler.auto_schedule.SearchTask + +.. 
autoclass:: tvm.auto_scheduler.auto_schedule.TuningOptions + +.. autofunction:: tvm.auto_scheduler.auto_schedule.create_task + +.. autofunction:: tvm.auto_scheduler.auto_schedule.auto_schedule + + + diff --git a/docs/api/python/autotvm.rst b/docs/api/python/autotvm.rst index 9357d1b6be08..5bde9ac47962 100644 --- a/docs/api/python/autotvm.rst +++ b/docs/api/python/autotvm.rst @@ -18,7 +18,7 @@ tvm.autotvm ----------- .. automodule:: tvm.autotvm -.. automodule:: tvm.autotvm.apply_history_best +.. autofunction:: tvm.autotvm.apply_history_best tvm.autotvm.measure ~~~~~~~~~~~~~~~~~~~ diff --git a/docs/api/python/index.rst b/docs/api/python/index.rst index bc9ec5fd8304..a6179684413d 100644 --- a/docs/api/python/index.rst +++ b/docs/api/python/index.rst @@ -40,6 +40,7 @@ Python API relay/dataflow_pattern relay/testing autotvm + auto_scheduler rpc micro contrib diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py index 0b80b8c3d432..e4ea1ec80313 100644 --- a/python/tvm/auto_scheduler/auto_schedule.py +++ b/python/tvm/auto_scheduler/auto_schedule.py @@ -31,10 +31,10 @@ import tvm._ffi from tvm.runtime import Object from .measure import LocalBuilder, LocalRunner -from .workload_registry import make_workload_key, workload_key_to_tensors +from .workload_registry import make_workload_key from .compute_dag import ComputeDAG -from .search_policy import EmptyPolicy -from .utils import get_func_name +from .cost_model import XGBModel +from .search_policy import SketchPolicy from . 
import _ffi_api @@ -158,6 +158,7 @@ def __init__( measure_callbacks, ) + def create_task(func, args, target, target_host=None, hardware_params=None): """Create a search task @@ -183,16 +184,16 @@ def create_task(func, args, target, target_host=None, hardware_params=None): dag = ComputeDAG(workload_key) return SearchTask(dag, workload_key, target, target_host, hardware_params) + def auto_schedule(task, search_policy=None, tuning_options=TuningOptions()): - """Do auto scheduling for a computation declaration. + """Run auto scheduling search for a task Parameters ---------- task : SearchTask The SearchTask for the computation declaration. search_policy : Optional[SearchPolicy] - The search policy to be used for schedule search. Use EmptyPolicy as default, which always - returns an empty schedule. + The search policy to be used for schedule search. tuning_options : Optional[TuningOptions] Tuning and measurement options. @@ -205,5 +206,9 @@ def auto_schedule(task, search_policy=None, tuning_options=TuningOptions()): "Invalid task: " + task + " . `auto_scheduler.auto_schedule` expects a SearchTask." 
) - sch, tensors = _ffi_api.AutoSchedule(search_policy or EmptyPolicy(task), tuning_options) + if search_policy is None: + cost_model = XGBModel() + search_policy = SketchPolicy(task, cost_model) + + sch, tensors = _ffi_api.AutoSchedule(search_policy, tuning_options) return sch, tensors diff --git a/tutorials/auto_scheduler/README.txt b/tutorials/auto_scheduler/README.txt index 52c4c14b15b0..75986679f0bd 100644 --- a/tutorials/auto_scheduler/README.txt +++ b/tutorials/auto_scheduler/README.txt @@ -1,2 +1,2 @@ AutoScheduler : Template-free Auto Scheduling ------------ +--------------------------------------------- diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 2fd30a3c54ff..fefdbf830e1b 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -20,13 +20,12 @@ **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_ -This is a tutorial on how to use the auto-scheduler in TVM. - -Different from the exiting autotvm which relies on manual templates to -define the search space, the auto-scheduler does not require any templates. -The user only needs to write the computation declaration, -the auto-scheduler then automatically generate a large -search space and begins the search (or auto-tuning). +Different from the exiting :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any templates. +The auto-scheduler is template-free, so users only need to write the computation declaration without +any schedule commands or templates. +The auto-scheduler can automatically generate a large +search space and find a good schedule in the space. We use matrix multiplication as an example in this tutorial. 
""" @@ -36,62 +35,72 @@ from tvm import te, testing, auto_scheduler ###################################################################### +# Define the computation +# ^^^^^^^^^^^^^^^^^^^^^^ # To begin with, we define the computation of a matmul with bias add. # The function should return the list of input/output tensors. # From these tensors, the auto-scheduler can get the whole computational graph. + @auto_scheduler.register_workload def matmul_add(N, L, M, dtype): - A = te.placeholder((N, L), name='A', dtype=dtype) - B = te.placeholder((L, M), name='B', dtype=dtype) - C = te.placeholder((N, M), name='C', dtype=dtype) + A = te.placeholder((N, L), name="A", dtype=dtype) + B = te.placeholder((L, M), name="B", dtype=dtype) + C = te.placeholder((N, M), name="C", dtype=dtype) - k = te.reduce_axis((0, L), name='k') - matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), - name='matmul') - D = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name='D') + k = te.reduce_axis((0, L), name="k") + matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="matmul") + out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="D") + + return [A, B, C, out] - return [A, B, C, D] ###################################################################### -# We then create the a search task with N=L=M=128 and dtype='float32' +# Create the search task +# ^^^^^^^^^^^^^^^^^^^^^^ +# We then create the a search task with N=L=M=128 and dtype="float32" target = tvm.target.Target("llvm") -task = auto_scheduler.create_task(matmul_add, (128, 128, 128, 'float32'), target) +task = auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target) +# inspect the computational graph print(task.compute_dag) ###################################################################### # Next, we set parameters for the auto-scheduler. -# `num_measure_trials` is the number of measurement trials we can use during the search. 
-# We only make 10 trials in this tutorial for fast demonstration. In practice, 1000 is a good value for -# the search to converge. You can do more trials according to your time budget. -# In addition, we use `RecordToFile` to log measurement records into a file `test.json`. -# The measurement records can be used to query the history best, resume the search, -# or train the cost model later. - -tune_option = auto_scheduler.TuningOptions(num_measure_trials=2, - measure_callbacks=[auto_scheduler.RecordToFile('test.json')]) +# +# * `num_measure_trials` is the number of measurement trials we can use during the search. +# We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a +# good value for the search to converge. You can do more trials according to your time budget. +# * In addition, we use `RecordToFile` to dump measurement records into a file `matmul.json`. +# The measurement records can be used to query the history best, resume the search, +# or do more analysis later. +# * see :any:`auto_schedule.TuningOptions`: for more parameters + +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=10, measure_callbacks=[auto_scheduler.RecordToFile("matmul.json")] +) ###################################################################### +# Run the search +# ^^^^^^^^^^^^^^ # Now we get all inputs ready. Pretty simple, isn't it? # We can kick off the search and let the auto-scheduler do its magic. # After some measurement trials, it will return the best schedule it founds. -sch, args = auto_scheduler.auto_schedule(task, - tuning_options=tune_option) +sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option) ###################################################################### # We can lower schedule to see the IR after auto-scheduling. -# We can also build the binary function as usual. 
print(tvm.lower(sch, args, simple_mode=True)) -func = tvm.build(sch, args) ###################################################################### -# Finally, let use do a correctness check +# Check correctness +# ^^^^^^^^^^^^^^^^^ +# We build the binary and check its correctness -# check correctness +func = tvm.build(sch, args) a_np = np.random.uniform(size=(128, 128)).astype(np.float32) b_np = np.random.uniform(size=(128, 128)).astype(np.float32) c_np = np.random.uniform(size=(128, 128)).astype(np.float32) @@ -100,4 +109,25 @@ def matmul_add(N, L, M, dtype): d_tvm = tvm.nd.empty(d_np.shape) func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d_tvm) -tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-2) \ No newline at end of file +tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-3) + +###################################################################### +# Using the record file +# ^^^^^^^^^^^^^^^^^^^^^ +# During the search, all measuremnt records is dumpped into the record +# file "matmul.json". The measurement records can be used to resume the +# search, re-apply search results and other analysis. +# +# Here we show an example where we load the best schedule from a file, +# print the equivalent python schedule API, and build the binary again. + +inp, res = auto_scheduler.load_best("matmul.json", task.workload_key) + +# Print equivalent python schedule API. This can be used for debugging and +# learning the behavior of auto-scheduler. +print(task.compute_dag.print_python_code_from_state(inp.state)) + +# Rebuild the binary. This shows how you can apply the best schedule from a +# log file without reruning the search again. +sch, args = task.compute_dag.apply_steps_from_state(inp.state) +func = tvm.build(sch, args) diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt index 970430320400..a1d33ba088cc 100644 --- a/tutorials/autotvm/README.txt +++ b/tutorials/autotvm/README.txt @@ -1,4 +1,4 @@ .. 
_tutorials-autotvm-sec: AutoTVM : Template-based Auto Tuning ------------ +------------------------------------ From a943a32b3113a8b0e5bba8119e7ca3df65b7dfca Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 15 Sep 2020 18:39:39 -0700 Subject: [PATCH 3/8] update --- tutorials/auto_scheduler/tune_matmul_x86.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index fefdbf830e1b..ded10cb33357 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -50,7 +50,7 @@ def matmul_add(N, L, M, dtype): k = te.reduce_axis((0, L), name="k") matmul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="matmul") - out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="D") + out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out") return [A, B, C, out] @@ -63,7 +63,7 @@ def matmul_add(N, L, M, dtype): target = tvm.target.Target("llvm") task = auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target) -# inspect the computational graph +# Inspect the computational graph print(task.compute_dag) ###################################################################### @@ -92,6 +92,8 @@ def matmul_add(N, L, M, dtype): ###################################################################### # We can lower schedule to see the IR after auto-scheduling. +# The auto-scheduler correctly performs optimizations including multi-level tiling, +# parallelization, vectorization, unrolling and fusion. 
print(tvm.lower(sch, args, simple_mode=True)) @@ -114,13 +116,14 @@ def matmul_add(N, L, M, dtype): ###################################################################### # Using the record file # ^^^^^^^^^^^^^^^^^^^^^ -# During the search, all measuremnt records is dumpped into the record +# During the search, all measuremnt records are dumpped into the record # file "matmul.json". The measurement records can be used to resume the # search, re-apply search results and other analysis. # # Here we show an example where we load the best schedule from a file, # print the equivalent python schedule API, and build the binary again. +# Load the measuremnt record for the best schedule inp, res = auto_scheduler.load_best("matmul.json", task.workload_key) # Print equivalent python schedule API. This can be used for debugging and From 6dd4fd89f75983ae9fae66940867fb23e68e2adf Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 15 Sep 2020 20:50:52 -0700 Subject: [PATCH 4/8] Apply suggestions from code review Co-authored-by: Cody Yu --- python/tvm/auto_scheduler/auto_schedule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py index e4ea1ec80313..b58ac31cc1fe 100644 --- a/python/tvm/auto_scheduler/auto_schedule.py +++ b/python/tvm/auto_scheduler/auto_schedule.py @@ -167,7 +167,7 @@ def create_task(func, args, target, target_host=None, hardware_params=None): func : Union[Function, str] The function that returns the compute declaration Tensors. Can be the a function or the function name. - args : Args + args : Union[Tuple[Any, ...], List[Any]] The args of the function. target : tvm.target.Target The target device of this search task. 
@@ -178,7 +178,7 @@ def create_task(func, args, target, target_host=None, hardware_params=None): Returns ------- - task : the created task + SearchTask: the created task """ workload_key = make_workload_key(func, args) dag = ComputeDAG(workload_key) From 77725e7c411c2b780b4df2ec039b72e9514499bb Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 15 Sep 2020 21:16:03 -0700 Subject: [PATCH 5/8] address comments --- tutorials/auto_scheduler/tune_matmul_x86.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index ded10cb33357..1c1ba47c47b6 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -58,7 +58,7 @@ def matmul_add(N, L, M, dtype): ###################################################################### # Create the search task # ^^^^^^^^^^^^^^^^^^^^^^ -# We then create the a search task with N=L=M=128 and dtype="float32" +# We then create a search task with N=L=M=128 and dtype="float32" target = tvm.target.Target("llvm") task = auto_scheduler.create_task(matmul_add, (128, 128, 128, "float32"), target) @@ -86,12 +86,12 @@ def matmul_add(N, L, M, dtype): # ^^^^^^^^^^^^^^ # Now we get all inputs ready. Pretty simple, isn't it? # We can kick off the search and let the auto-scheduler do its magic. -# After some measurement trials, it will return the best schedule it founds. +# After some measurement trials, it will return the best schedule it found. sch, args = auto_scheduler.auto_schedule(task, tuning_options=tune_option) ###################################################################### -# We can lower schedule to see the IR after auto-scheduling. +# We can lower the schedule to see the IR after auto-scheduling. # The auto-scheduler correctly performs optimizations including multi-level tiling, # parallelization, vectorization, unrolling and fusion. 
@@ -118,7 +118,7 @@ def matmul_add(N, L, M, dtype): # ^^^^^^^^^^^^^^^^^^^^^ # During the search, all measuremnt records are dumpped into the record # file "matmul.json". The measurement records can be used to resume the -# search, re-apply search results and other analysis. +# search, re-apply search results and perform other analyses. # # Here we show an example where we load the best schedule from a file, # print the equivalent python schedule API, and build the binary again. @@ -127,7 +127,7 @@ def matmul_add(N, L, M, dtype): inp, res = auto_scheduler.load_best("matmul.json", task.workload_key) # Print equivalent python schedule API. This can be used for debugging and -# learning the behavior of auto-scheduler. +# learning the behavior of the auto-scheduler. print(task.compute_dag.print_python_code_from_state(inp.state)) # Rebuild the binary. This shows how you can apply the best schedule from a From 9d418f89536109157f8f6d38100f8f57ed724689 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 16 Sep 2020 09:28:12 -0700 Subject: [PATCH 6/8] fix bugs --- python/tvm/auto_scheduler/auto_schedule.py | 30 +++++++++---------- .../search_policy/sketch_policy_rules.cc | 4 +-- tutorials/auto_scheduler/tune_matmul_x86.py | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py index b58ac31cc1fe..c580eb5e600d 100644 --- a/python/tvm/auto_scheduler/auto_schedule.py +++ b/python/tvm/auto_scheduler/auto_schedule.py @@ -92,27 +92,27 @@ class TuningOptions(Object): Parameters ---------- num_measure_trials: int = 0 - The number of measurement trials. - The search policy measures `num_measure_trials` schedules in total and returns the best one - among them. - With `num_measure_trials` == 0, the policy will do the schedule search but won't involve - measurement. This can be used to get a runnable schedule quickly without auto-tuning. + The number of measurement trials. 
+ The search policy measures `num_measure_trials` schedules in total and returns the best one + among them. + With `num_measure_trials` == 0, the policy will do the schedule search but won't involve + measurement. This can be used to get a runnable schedule quickly without auto-tuning. early_stopping: Optional[int] - Stop the tuning early if getting no improvement after n measurements. + Stop the tuning early if getting no improvement after n measurements. num_measures_per_round: int = 64 - The number of schedules to be measured at each search round. - The whole schedule search process will try a total number of `num_measure_trials` in several - rounds. + The number of schedules to be measured at each search round. + The whole schedule search process will try a total number of `num_measure_trials` in several + rounds. verbose: int = 1 - Verbosity level. 0 for silent, 1 to output information during schedule search. + Verbosity level. 0 for silent, 1 to output information during schedule search. builder: Union[ProgramBuilder, str] = 'local' - ProgramBuilder which builds the program. + ProgramBuilder which builds the program. runner: Union[ProgramRunner, str] = 'local' - ProgramRunner which runs the program and measures time costs. + ProgramRunner which runs the program and measures time costs. measure_callbacks: Optional[List[MeasureCallback]] - Callback functions called after each measurement. - Candidates: - - auto_scheduler.RecordToFile + Callback functions called after each measurement. 
+ Candidates: + - auto_scheduler.RecordToFile """ def __init__( diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index 843301c2bb8f..38a72a7734b4 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -593,7 +593,7 @@ PopulationGenerationRule::ResultKind MutateComputeLocationCommon(SketchPolicyNod PopulationGenerationRule::ResultKind InitChangeComputeLocation::Apply(SketchPolicyNode* policy, State* state) const { - return MutateComputeLocationCommon(policy, state, false); + return MutateComputeLocationCommon(policy, state, true); } PopulationGenerationRule::ResultKind InitParallel::Apply(SketchPolicyNode* policy, @@ -1059,7 +1059,7 @@ PopulationGenerationRule::ResultKind MutateMaxUnrollFactor::Apply(SketchPolicyNo PopulationGenerationRule::ResultKind MutateComputeLocation::Apply(SketchPolicyNode* policy, State* state) const { - return MutateComputeLocationCommon(policy, state, true); + return MutateComputeLocationCommon(policy, state, false); } PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* policy, diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 1c1ba47c47b6..1a3e06f9dd06 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -20,7 +20,7 @@ **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_ -Different from the exiting :ref:`autotvm ` which relies on +Different from the existing :ref:`autotvm ` which relies on manual templates to define the search space, the auto-scheduler does not require any templates. The auto-scheduler is template-free, so users only need to write the computation declaration without any schedule commands or templates. 
From 5cad061d9074ce5e74a28933b3f9425daabdcbaa Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 16 Sep 2020 10:19:01 -0700 Subject: [PATCH 7/8] add the exmple for resuming the search --- python/tvm/auto_scheduler/auto_schedule.py | 2 +- tutorials/auto_scheduler/tune_matmul_x86.py | 49 ++++++++++++++++++--- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py index c580eb5e600d..80510d355d21 100644 --- a/python/tvm/auto_scheduler/auto_schedule.py +++ b/python/tvm/auto_scheduler/auto_schedule.py @@ -112,7 +112,7 @@ class TuningOptions(Object): measure_callbacks: Optional[List[MeasureCallback]] Callback functions called after each measurement. Candidates: - - auto_scheduler.RecordToFile + - auto_scheduler.RecordToFile """ def __init__( diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 1a3e06f9dd06..59e28a84c2f2 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """ -Using the template-free auto-scheduler on CPU +Auto-scheduling a Subgraph for CPU ============================================= **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_ @@ -74,7 +74,7 @@ def matmul_add(N, L, M, dtype): # good value for the search to converge. You can do more trials according to your time budget. # * In addition, we use `RecordToFile` to dump measurement records into a file `matmul.json`. # The measurement records can be used to query the history best, resume the search, -# or do more analysis later. +# and do more analyses later. 
# * see :any:`auto_schedule.TuningOptions`: for more parameters tune_option = auto_scheduler.TuningOptions( @@ -117,10 +117,11 @@ def matmul_add(N, L, M, dtype): # Using the record file # ^^^^^^^^^^^^^^^^^^^^^ # During the search, all measuremnt records are dumpped into the record -# file "matmul.json". The measurement records can be used to resume the -# search, re-apply search results and perform other analyses. -# -# Here we show an example where we load the best schedule from a file, +# file "matmul.json". The measurement records can be used to re-apply search results, +# resume the search, and perform other analyses. + +###################################################################### +# Here is an example where we load the best schedule from a file, # print the equivalent python schedule API, and build the binary again. # Load the measuremnt record for the best schedule @@ -134,3 +135,39 @@ def matmul_add(N, L, M, dtype): # log file without reruning the search again. sch, args = task.compute_dag.apply_steps_from_state(inp.state) func = tvm.build(sch, args) + +###################################################################### +# A more complicated example is to resume the search. +# In this case, we need to create the search policy and cost model by ourselves +# and resume the status of search policy and cost model with the log file. +# In the example below we resume the status and do 5 more trials. 
+ + +def resume_search(task, log_file): + cost_model = auto_scheduler.XGBModel() + cost_model.update_from_file(log_file) + search_policy = auto_scheduler.SketchPolicy( + task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] + ) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)] + ) + sch, args = auto_scheduler.auto_schedule(task, search_policy, tuning_options=tune_option) + + +# resume_search(task, "matmul.json") + +###################################################################### +# .. note:: +# We cannot run the line above because of the conflict between +# python's multiprocessing and tvm's thread pool. +# After running a tvm generated binary (L112), the python's multiprocessing +# library will hang forever. +# You have to make sure that you don't run any tvm generated binaries before +# calling ansor's search. To run the L156 above, you should comment out L112-114. +# +# You should be careful about this problem in your applications. +# There are other workarounds for this problem. +# For example, you can start a new thread/process (with the builtin python library +# threading or multiprocessing) and run the tvm binaries in the new thread/process. +# This provides isolation and avoids the conflict in the main thread/process. 
From 28bccefa2cbe0a5237d101c0c6fb26f460042ddc Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 16 Sep 2020 13:22:47 -0700 Subject: [PATCH 8/8] fix lint --- python/tvm/auto_scheduler/auto_schedule.py | 2 +- tutorials/auto_scheduler/tune_matmul_x86.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/auto_scheduler/auto_schedule.py b/python/tvm/auto_scheduler/auto_schedule.py index 80510d355d21..eae8b2527b6e 100644 --- a/python/tvm/auto_scheduler/auto_schedule.py +++ b/python/tvm/auto_scheduler/auto_schedule.py @@ -112,7 +112,7 @@ class TuningOptions(Object): measure_callbacks: Optional[List[MeasureCallback]] Callback functions called after each measurement. Candidates: - - auto_scheduler.RecordToFile + - auto_scheduler.RecordToFile """ def __init__( diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/auto_scheduler/tune_matmul_x86.py index 59e28a84c2f2..1a9af42510eb 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/auto_scheduler/tune_matmul_x86.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. """ -Auto-scheduling a Subgraph for CPU +Auto-scheduling matrix multiplication for CPU ============================================= **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_