diff --git a/docs/conf.py b/docs/conf.py index 189877da8f5d..cffb616f3b22 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -409,17 +409,6 @@ def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func): from sphinx_gallery.sorting import ExplicitOrder examples_dirs = [ - # legacy tutorial structure under gallery folder - tvm_path.joinpath("gallery", "tutorial"), - tvm_path.joinpath("gallery", "how_to", "compile_models"), - tvm_path.joinpath("gallery", "how_to", "deploy_models"), - tvm_path.joinpath("gallery", "how_to", "work_with_relay"), - tvm_path.joinpath("gallery", "how_to", "work_with_schedules"), - tvm_path.joinpath("gallery", "how_to", "optimize_operators"), - tvm_path.joinpath("gallery", "how_to", "tune_with_autotvm"), - tvm_path.joinpath("gallery", "how_to", "tune_with_autoscheduler"), - tvm_path.joinpath("gallery", "how_to", "extend_tvm"), - # New tutorial structure under docs folder tvm_path.joinpath("docs", "get_started", "tutorials"), tvm_path.joinpath("docs", "how_to", "tutorials"), tvm_path.joinpath("docs", "deep_dive", "relax", "tutorials"), @@ -427,17 +416,6 @@ def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func): ] gallery_dirs = [ - # legacy tutorial structure under gallery folder - "tutorial", - "how_to/compile_models", - "how_to/deploy_models", - "how_to/work_with_relay", - "how_to/work_with_schedules", - "how_to/optimize_operators", - "how_to/tune_with_autotvm", - "how_to/tune_with_autoscheduler", - "how_to/extend_tvm", - # New tutorial structure under docs folder "get_started/tutorials/", "how_to/tutorials/", "deep_dive/relax/tutorials/", @@ -448,68 +426,7 @@ def jupyter_notebook(script_blocks, gallery_conf, target_dir, real_func): # The listed files are sorted according to the list. # The unlisted files are sorted by filenames. # The unlisted files always appear after listed files. -within_subsection_order = { - "tutorial": [ - "introduction.py", - "install.py", - "tvmc_command_line_driver.py", - "tvmc_python.py", - "autotvm_relay_x86.py", - "tensor_expr_get_started.py", - "autotvm_matmul_x86.py", - "auto_scheduler_matmul_x86.py", - "tensor_ir_blitz_course.py", - "topi.pi", - "cross_compilation_and_rpc.py", - "relay_quick_start.py", - "uma.py", - ], - "compile_models": [ - "from_pytorch.py", - "from_tensorflow.py", - "from_mxnet.py", - "from_onnx.py", - "from_keras.py", - "from_tflite.py", - "from_coreml.py", - "from_darknet.py", - "from_caffe2.py", - "from_paddle.py", - ], - "work_with_schedules": [ - "schedule_primitives.py", - "reduction.py", - "scan.py", - "extern_op.py", - "tensorize.py", - "tuple_inputs.py", - "tedd.py", - ], - "optimize_operators": [ - "opt_gemm.py", - "opt_conv_cuda.py", - "opt_conv_tensorcore.py", - ], - "tune_with_autotvm": [ - "tune_conv2d_cuda.py", - "tune_relay_cuda.py", - "tune_relay_x86.py", - "tune_relay_arm.py", - "tune_relay_mobile_gpu.py", - ], - "tune_with_autoscheduler": [ - "tune_matmul_x86.py", - "tune_conv2d_layer_cuda.py", - "tune_network_x86.py", - "tune_network_cuda.py", - ], - "extend_tvm": [ - "low_level_custom_pass.py", - "use_pass_infra.py", - "use_pass_instrument.py", - "bring_your_own_datatypes.py", - ], -} +within_subsection_order = {} class WithinSubsectionOrder: diff --git a/docs/dev/how_to/debugging_tvm.rst b/docs/dev/how_to/debugging_tvm.rst deleted file mode 100644 index 1e3c9fb39363..000000000000 --- a/docs/dev/how_to/debugging_tvm.rst +++ /dev/null @@ -1,72 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. 
See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _debugging-tvm: - -Debugging TVM -============== - -**NOTE**: This page is a work in progress. Everyone is welcome to add suggestions and tips by -sending a PR to modify this page. The goal of this page is to centralize the techniques commonly used -to debug TVM and to spread awareness to the community. To that end, we may -seek to promote the most broadly used techniques to the top of this doc. - -VLOGging --------- - -TVM provides a verbose-logging facility that allows you to commit trace-level debugging messages -without impacting the binary size or runtime of TVM in production. You can use VLOG in your code -as follows: - -.. code-block:: c++ - - void Foo(const std::string& bar) { - VLOG(2) << "Running Foo(" << bar << ")"; - // ... - } - -In this example, the integer ``2`` passed to ``VLOG()`` indicates a verbosity level. The higher the -level, the more logs are printed. In general, TVM levels range from 0 to 2, with 3 being used only for -extremely low-level core runtime details. The VLOG system is configured at startup time to print -VLOG statements between ``0`` and some integer ``N``. ``N`` can be set per-file or globally. - -VLOGs don't print or impact binary size or runtime by default (when compiled with proper -optimization). To enable VLOGging, do the following: - -1. In ``config.cmake``, ensure you ``set(USE_RELAY_DEBUG ON)``. This flag is used to enable - VLOGging. -2. Launch Python passing ``TVM_LOG_DEBUG=<spec>``, where ``<spec>`` is a comma-separated list of - level assignments of the form ``<file_name>=<level>``. Here are some specializations: - - - The special filename ``DEFAULT`` sets the VLOG level for all files. - - ``<level>`` can be set to ``-1`` to disable VLOG in that file. - - ``<file_name>`` is the name of the C++ source file (e.g. ``.cc``, not ``.h``) relative to the - ``src/`` directory in the TVM repo. You do not need to supply ``src/`` when specifying the - file path, but if you do, VLOG will still interpret the path correctly. - -Examples: - -.. code-block:: shell - - # enable VLOG(0), VLOG(1), VLOG(2) in all files. - $ TVM_LOG_DEBUG=DEFAULT=2 python3 -c 'import tvm' - - # enable VLOG(0), VLOG(1), VLOG(2) in all files, except VLOG(2) in src/bar/baz.cc. - $ TVM_LOG_DEBUG=DEFAULT=2,bar/baz.cc=1 python3 -c 'import tvm' - - # enable VLOG(0), VLOG(1), VLOG(2) in all files, except in src/foo/bar.cc. - $ TVM_LOG_DEBUG=DEFAULT=2,src/foo/bar.cc=-1 python3 -c 'import tvm' diff --git a/docs/dev/how_to/how_to.rst b/docs/dev/how_to/how_to.rst deleted file mode 100644 index aa89324fb949..000000000000 --- a/docs/dev/how_to/how_to.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership.
The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _dev-how-to: - -Developer How-To Guide -====================== -This section contains a collection of tips about how to work on -various areas of the TVM stack. - -.. toctree:: - :maxdepth: 1 - - debugging_tvm - relay_add_op - relay_add_pass - relay_bring_your_own_codegen diff --git a/docs/dev/how_to/relay_add_op.rst b/docs/dev/how_to/relay_add_op.rst deleted file mode 100644 index 39c6a2c480ca..000000000000 --- a/docs/dev/how_to/relay_add_op.rst +++ /dev/null @@ -1,495 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _relay-add-op: - -Adding an Operator to Relay -=========================== - -In this document we will go over the steps needed to register a new TVM operator -in Relay. We will be following this PR which adds a `cumulative product`_ operation as an example. -The PR itself builds upon another PR which adds a `cumulative sum`_ operation. - -.. _cumulative product: https://github.com/apache/tvm/pull/7722 -.. _cumulative sum: https://github.com/apache/tvm/pull/7334 - -Registering a new operator requires a few steps: - -1. Add an attribute node declaring fixed arguments which are known at compile time -2. Write a type relation for your operation to integrate into Relay's type system. -3. Use the ``RELAY_REGISTER_OP`` macro in C++ to register the operator's arity, type, and other hints for the compiler -4. Write how the operator is computed -5. Register the compute, schedule with the relay operator -6. Define a C++ function to produce a call node for the operator and registering a Python API hook for the function -7. Wrapping the above Python API hook in a neater interface -8. Writing tests for the new relay operator - -1. Defining an Attribute Node ------------------------------ -Attributes are fixed arguments which are supposed to be known at compile time. The stride and dilation of a convolution -operator would be an appropriate example of fields which might belong in an attribute node for a convolution operator. - -Attributes should be defined in a file within the folder `include/tvm/relay/attrs/`_. - -.. 
_include/tvm/relay/attrs/: https://github.com/apache/tvm/tree/main/include/tvm/relay/attrs - -Ultimately we want to create an operator whose interface can be seen clearly in the final Python interface: - -.. code:: python - - def cumprod(data, axis=None, dtype=None, exclusive=None): - """Numpy style cumprod op. Return the cumulative inclusive product of the elements along - a given axis. - Parameters - ---------- - data : relay.Expr - The input data to the operator. - axis : int, optional - Axis along which the cumulative product is computed. The default (None) is to compute - the cumprod over the flattened array. - dtype : string, optional - Type of the returned array and of the accumulator in which the elements are multiplied. - If dtype is not specified, it defaults to the dtype of data. - exclusive : bool, optional - If true will return exclusive product in which the first element is not - included. In other terms, if true, the j-th output element would be - the product of the first (j-1) elements. Otherwise, it would be the product of - the first j elements. The product of zero elements will be 1. - Returns - ------- - result : relay.Expr - The result has the same size as data, and the same shape as data if axis is not None. - If axis is None, the result is a 1-d array. - """ - -A similar interface exists for ``cumsum()``. - -Therefore, when defining our attributes in ``include/tvm/relay/attrs/transform.h``, we choose the axis, -accumulation dtype, and exclusivity of the operation as appropriate fields for the struct. - -.. code:: c++ - - /*! \brief Attributes used in cumsum and cumprod operators */ - struct ScanopAttrs : public tvm::AttrsNode<ScanopAttrs> { - Integer axis; - DataType dtype; - Bool exclusive = Bool(false); - TVM_DECLARE_ATTRS(ScanopAttrs, "relay.attrs.ScanopAttrs") { - TVM_ATTR_FIELD(axis).describe("The axis to operate over").set_default(NullValue<Integer>()); - TVM_ATTR_FIELD(dtype).describe("Output data type").set_default(NullValue<DataType>()); - TVM_ATTR_FIELD(exclusive) - .describe("The first element is not included") - .set_default(Bool(false)); - } - }; - -2. Writing a Type Relation --------------------------- -To allow for flexibility in registering operators and greater -expressivity and granularity in expressing types in Relay, operators -are typed using relations between input and output types. These relations -are represented as functions that take in a list of input types and -output types (any of these types may be incomplete) and return a list -of input and output types that satisfies the relation. This includes shape -information which can be determined statically at compile time. Essentially, a -relation for an operator can enforce all the necessary typing rules -(namely by inspecting the input types) in addition to computing the -output type. - -The type relation for the cumulative product and sum operators can be found in -``src/relay/op/tensor/transform.cc``: - -..
code:: c++ - - TVM_REGISTER_NODE_TYPE(ScanopAttrs); - bool ScanopRel(const Array<Type>& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - // types: [data, output] - ICHECK_EQ(types.size(), 2) << "Expects two types, one for the input and another for the output"; - const auto* data = types[0].as<TensorTypeNode>(); - if (data == nullptr) { - ICHECK(types[0].as<IncompleteTypeNode>()) - << "Scanop: expect input type to be TensorType but get " << types[0]; - return false; - } - - const auto* param = attrs.as<ScanopAttrs>(); - - auto dtype = param->dtype; - if (dtype.is_void()) { - dtype = data->dtype; - } - - if (param->axis.defined()) { - reporter->Assign(types[1], TensorType(data->shape, dtype)); - } else { - auto prod = data->shape[0]; - for (size_t i = 1; i < data->shape.size(); ++i) { - prod = prod * data->shape[i]; - } - reporter->Assign(types[1], TensorType({prod}, dtype)); - } - - return true; - } - -3. Relating the Arity and Attributes to an Operation ----------------------------------------------------- - -We then register the names of our new ops and annotate them with their calling interface. -The ``RELAY_REGISTER_OP`` macro in C++ allows a developer -to specify the following information about an operator in Relay: - -- Arity (number of arguments) -- Names and descriptions for positional arguments -- Support level (1 indicates an internal intrinsic; higher numbers indicate less integral or externally supported operators) -- A type relation for the operator -- Other annotations useful when optimizing the operation. - -Once again we add this to ``src/relay/op/tensor/transform.cc``: - -.. code:: c++ - - RELAY_REGISTER_OP("cumsum") - .describe( - R"doc(Return the cumulative sum of the elements along a given axis.)doc" TVM_ADD_FILELINE) - .set_num_inputs(1) - .add_argument("data", "Tensor", "The input tensor.") - .set_support_level(3) - .add_type_rel("Cumsum", ScanopRel) - .set_attr<TOpPattern>("TOpPattern", kOpaque); - - RELAY_REGISTER_OP("cumprod") - .describe( - R"doc(Return the cumulative product of the elements along a given axis.)doc" TVM_ADD_FILELINE) - .set_num_inputs(1) - .add_argument("data", "Tensor", "The input tensor.") - .set_support_level(3) - .add_type_rel("Cumprod", ScanopRel) - .set_attr<TOpPattern>("TOpPattern", kOpaque); - -In this case ``TOpPattern`` is a hint to the compiler on the pattern of computation the operator performs, which might be -useful for fusing operators. ``kOpaque`` tells TVM not to bother trying to fuse this operator. - -4. Defining the Compute of the Operation ----------------------------------------- - -While we've now defined the interface for our operations, we still need to define -how to perform the actual calculations for cumulative sum and product. - -Writing this code is outside the scope of this tutorial. For now, we assume we -have a well-tested implementation for the operation's compute. For more details -on how to do this, we recommend looking up the tutorials on :ref:`tensor -expressions `, :ref:`TVM's operator inventory -(topi) ` and looking at the example cumulative sum and product -implementations found in `python/tvm/topi/scan.py`_ and the GPU versions in -`python/tvm/topi/cuda/scan.py`_. - -.. _python/tvm/topi/scan.py: https://github.com/apache/tvm/blob/main/python/tvm/topi/scan.py -.. _python/tvm/topi/cuda/scan.py: https://github.com/apache/tvm/blob/main/python/tvm/topi/cuda/scan.py - -5. Hooking up Compute and Strategy with Relay ---------------------------------------------- - -After you have implemented your compute function, we need to glue it to our -relay operation.
Within TVM this means not only defining the computation, but also the schedule -for an operation. A strategy is a method which picks which computation and which schedule -to use. For example, for 2D convolutions we might recognize we are doing a depthwise convolution -and dispatch to a more efficient computation and schedule as a result. In our case however we have -no such need except for dispatching between our CPU and GPU implementations. In -``python/tvm/relay/op/strategy/generic.py`` and ``python/tvm/relay/op/strategy/cuda.py`` we -add the following strategies: - -.. code:: python - - def wrap_compute_scanop(topi_compute): - """Wrap scanop style topi compute""" - - def _compute_scanop(attrs, inputs, _): - return [topi_compute(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)] - - return _compute_scanop - - - @override_native_generic_func("cumsum_strategy") - def cumsum_strategy(attrs, inputs, out_type, target): - """cumsum generic strategy""" - strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_scanop(topi.cumsum), - wrap_topi_schedule(topi.generic.schedule_extern), - name="cumsum.generic", - ) - return strategy - - - @override_native_generic_func("cumprod_strategy") - def cumprod_strategy(attrs, inputs, out_type, target): - """cumprod generic strategy""" - strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_scanop(topi.cumprod), - wrap_topi_schedule(topi.generic.schedule_extern), - name="cumprod.generic", - ) - return strategy - - @cumsum_strategy.register(["cuda", "gpu"]) - def cumsum_strategy_cuda(attrs, inputs, out_type, target): - """cumsum cuda strategy""" - strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_scanop(topi.cuda.cumsum), - wrap_topi_schedule(topi.cuda.schedule_scan), - name="cumsum.cuda", - ) - return strategy - - - @cumprod_strategy.register(["cuda", "gpu"]) - def cumprod_strategy_cuda(attrs, inputs, out_type, target): - """cumprod cuda strategy""" - strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_scanop(topi.cuda.cumprod), - wrap_topi_schedule(topi.cuda.schedule_scan), - name="cumprod.cuda", - ) - return strategy - -Where in each strategy we define the compute we wrote and the schedule to use within ``add_implementation()``. -We finally link the strategy and compute with the defined relay operator in ``python/tvm/relay/op/_transform.py``: - -.. code:: python - - # cumsum - @_reg.register_compute("cumsum") - def compute_cumsum(attrs, inputs, output_type): - """Compute definition of cumsum""" - return [topi.cumsum(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)] - - - _reg.register_strategy("cumsum", strategy.cumsum_strategy) - _reg.register_shape_func("cumsum", False, elemwise_shape_func) - - # cumprod - @_reg.register_compute("cumprod") - def compute_cumprod(attrs, inputs, output_type): - """Compute definition of cumprod""" - return [topi.cumprod(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)] - - - _reg.register_strategy("cumprod", strategy.cumprod_strategy) - _reg.register_shape_func("cumprod", False, elemwise_shape_func) - -The shape functions are used for determining output shape given a dynamically shaped tensor. In this -case we tell TVM the output shape will be the same as the input shape. - -6. Creating a Relay Call Node and Exposing a Python Hook --------------------------------------------------------- -We now have a working operation and now just need to properly call it -via a Relay Call Node. 
This step requires simply writing a function that takes -the arguments to the operator (as Relay expressions) and -returning a call node to the operator (i.e., the node that -should be placed into the Relay AST where the call to the -operator is intended). - -At present call attributes and type arguments (the last two fields) -are not supported, so it suffices to use ``Op::Get`` to fetch -the operator's information from the operator registry and pass in -the arguments to the call node, as below. In ``src/relay/op/tensor/transform.cc``: - -.. code:: c++ - - Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Bool exclusive) { - auto attrs = make_object(); - attrs->dtype = dtype; - attrs->axis = axis; - attrs->exclusive = exclusive; - static const Op& op = Op::Get("cumsum"); - return Call(op, {data}, Attrs(attrs), {}); - } - - TVM_REGISTER_GLOBAL("relay.op._make.cumsum").set_body_typed(MakeCumsum); - - Expr MakeCumprod(Expr data, Integer axis, DataType dtype, Bool exclusive) { - auto attrs = make_object(); - attrs->dtype = dtype; - attrs->axis = axis; - attrs->exclusive = exclusive; - static const Op& op = Op::Get("cumprod"); - return Call(op, {data}, Attrs(attrs), {}); - } - - TVM_REGISTER_GLOBAL("relay.op._make.cumprod").set_body_typed(MakeCumprod); - -Where ``TVM_REGISTER_GLOBAL`` exposes the ``MakeCumsum`` and ``MakeCumprod`` functions -in Python via ``relay.op._make.cumsum(...)`` and ``relay.op._make.cumprod(...)``. - -7. Including a Cleaner Python API Hook --------------------------------------- - -It is generally the convention in Relay, that functions exported -through ``TVM_REGISTER_GLOBAL`` should be wrapped in a separate -Python function rather than called directly in Python. For our -operators we expose this cleaner interface in ``python/tvm/relay/op/transform.py`` - -.. code:: python - - def cumsum(data, axis=None, dtype=None, exclusive=None): - return _make.cumsum(data, axis, dtype, exclusive) - - def cumprod(data, axis=None, dtype=None, exclusive=None): - return _make.cumprod(data, axis, dtype, exclusive) - -Note that these Python wrappers might also be good opportunities to -provide an easier interface to the operator. For example, the -``concat`` operator is registered as taking only one operator, -namely a tuple with the tensors to be concatenated, but the Python -wrapper takes the tensors as arguments and combines them into a tuple -before producing the call node: - -.. code:: python - - def concat(*args): - """Concatenate the input tensors along the zero axis. - - Parameters - ---------- - args: list of Tensor - - Returns - ------- - tensor: The concatenated tensor. - """ - tup = Tuple(list(args)) - return _make.concat(tup) - -8. Writing Unit Tests! ----------------------- -This is self explanatory! Some example unit tests can be found in -`tests/python/relay/test_op_level3.py`_ for our cumulative sum -and product operators. - -.. _tests/python/relay/test_op_level3.py: https://github.com/apache/tvm/blob/main/tests/python/relay/test_op_level3.py - - -Other Topics ------------- - -Gradient Operators -~~~~~~~~~~~~~~~~~~ - -Gradient operators are important for writing differentiable programs in -Relay. While it is the case that Relay's autodiff algorithm can differentiate -first-class language constructs, operators are opaque. Because Relay can't -look into the implementation, an explicit differentiation rule must be -provided. - -Both Python and C++ can be used to write gradient operators, but we focus our -examples on Python, as it is more commonly used. 
- -Adding a Gradient in Python -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A collection of Python gradient operators can be found in -``python/tvm/relay/op/_tensor_grad.py``. We will walk through two -representative examples: ``sigmoid`` and ``multiply``. - -.. code:: python - - @register_gradient("sigmoid") - def sigmoid_grad(orig, grad): - """Returns [grad * sigmoid(x) * (1 - sigmoid(x))].""" - return [grad * orig * (ones_like(orig) - orig)] - -The inputs here are the original operator ``orig`` and a gradient ``grad`` to -accumulate into. What we return is a list, where the element at the i'th -index is the derivative of the operator with respect to the operator's i'th -input. In general, the gradient will return a list with as many elements as -there are inputs to the base operator. - -Before we further analyze this definition, first we should recall the -derivative of the sigmoid function: :math:`\frac{\partial \sigma}{\partial x} -= \sigma(x)(1 - \sigma(x))`. The definition above looks similar to the -mathematical definition, but there is one important addition, which we -describe below. - -The term ``orig * (ones_like(orig) - orig)`` directly matches the derivative, -because ``orig`` here is the sigmoid function, but we're not just interested -in how to compute the gradient of this function. We're interested in -composing this gradient with other gradients, so we can accumulate the -gradient across an entire program. This is where the ``grad`` term comes in. -In the expression ``grad * orig * (ones_like(orig) - orig)``, multiplying by -``grad`` specifies how to compose the derivative with the gradient thus far. - -Now, we consider ``multiply``, a slightly more interesting example: - -.. code:: python - - @register_gradient("multiply") - def multiply_grad(orig, grad): - """Returns [grad * y, grad * x]""" - x, y = orig.args - return [collapse_sum_like(grad * y, x), - collapse_sum_like(grad * x, y)] - -In this example, there are two elements in the returned list, because -``multiply`` is a binary operator. And to recall, if :math:`f(x, y) = xy`, the -partial derivatives are :math:`\frac{\partial f}{\partial x} = y` and -:math:`\frac{\partial f}{\partial y} = x`. - -There is one required step for ``multiply`` that is not required for -``sigmoid``, because ``multiply`` has broadcasting semantics. Since the shape -of ``grad`` might not match the shape of the inputs, we use -``collapse_sum_like`` to take the contents of the ``grad * `` terms and -make the shape match the shape of the input we're differentiating with -respect to. - -Adding a Gradient in C++ -~~~~~~~~~~~~~~~~~~~~~~~~ - -Adding a gradient in C++ is similar to adding one in Python, but the -interface for registering is slightly different. - -First, make sure ``src/relay/transforms/pattern_utils.h`` is included. It provides -helper functions for creating nodes in the Relay AST. Then, define the -gradient in a similar fashion as in the Python example: - -.. code:: c - - tvm::Array MultiplyGrad(const Expr& orig_call, const Expr& output_grad) { - const Call& call = orig_call.Downcast(); - return { CollapseSumLike(Multiply(output_grad, call.args[1]), call.args[0]), - CollapseSumLike(Multiply(output_grad, call.args[0]), call.args[1]) }; - } - -Notice that in C++ we can't use the same operator overloading that we have in -Python, and we need to downcast, so the implementation is more verbose. Even -so, we can easily verify that this definition mirrors the earlier example in -Python. 
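For comparison, the sketch below shows what the ``sigmoid`` rule from the Python example might look like in C++. It is only an illustration: the function name ``SigmoidGrad`` is hypothetical, and it assumes that element-wise helpers such as ``OnesLike`` and ``Subtract`` are available from ``src/relay/transforms/pattern_utils.h`` alongside the ``Multiply`` and ``CollapseSumLike`` helpers used above.

.. code:: c

   // Hypothetical C++ counterpart of sigmoid_grad; assumes OnesLike/Subtract/Multiply
   // helpers from pattern_utils.h are available.
   tvm::Array<Expr> SigmoidGrad(const Expr& orig_call, const Expr& output_grad) {
     // As in the Python rule, the original call is reused as the forward result sigmoid(x).
     const Expr& sigmoid_out = orig_call;
     // grad * sigmoid(x) * (1 - sigmoid(x))
     return {Multiply(output_grad,
                      Multiply(sigmoid_out, Subtract(OnesLike(sigmoid_out), sigmoid_out)))};
   }

As with ``MultiplyGrad``, such a definition only takes effect once it is registered on the operator through the ``FPrimalGradient`` attribute, described next.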
- -Now, instead of using a Python decorator, we need to tack a ``set_attr`` call -for "FPrimalGradient" onto the end of the base operator's registration, in -order to register the gradient. - -.. code:: c - - RELAY_REGISTER_OP("multiply") - // ... - // Set other attributes - // ... - .set_attr("FPrimalGradient", MultiplyGrad); diff --git a/docs/dev/how_to/relay_add_pass.rst b/docs/dev/how_to/relay_add_pass.rst deleted file mode 100644 index 90211d0c98a5..000000000000 --- a/docs/dev/how_to/relay_add_pass.rst +++ /dev/null @@ -1,406 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _relay-add-pass: - -Adding a Compiler Pass to Relay -=============================== - -Compiler passes are the primary interface for both extending Relay's feature -set and for performing optimizations on Relay programs. By writing a compiler -pass, you can modify the AST or collect information about the AST, -depending on your goal. Indeed, some of Relay's most important built-in -features (e.g., autodiff and type inference) are nothing more than "standard" -compiler passes. - -At a high level, there are two key components to writing a pass: - -- Creating one or more C++ classes that traverse the program -- Wrapping the traversal implementation and its metadata in the pass manager API so it can neatly interface with the :ref:`pass-infra` - -To begin, we'll give an overview of the key mechanisms for writing a compiler -pass. Then, we'll walk through a concrete example of the constant-folding -pass in Relay. - -AST Traversers --------------- - -The base class used to traverse Relay programs is ``ExprFunctor``. The public -interface it provides is a ``VisitExpr`` method that takes an expression and -zero or more arguments and returns an instance of some type. When you extend -this class, you define the AST traversal pattern by overriding -implementations of ``VisitExpr_`` for each type of expression. - -The relation between ``VisitExpr`` and ``VisitExpr_`` has to do with -dispatch. Each ``VisitExpr_`` definition targets a specific type of -expression, but you don't always know which node type you'll be visiting. -To remedy this, ``ExprFunctor`` provides a ``VisitExpr`` function which -routes from the given expression to the ``VisitExpr_`` case that handles it. -Although C++ already provides dynamic dispatch, ``ExprFunctor`` defines its -own vtable, which ``VisitExpr`` uses. By defining our own vtable, we have -more control over dispatch. For example, if we wanted to define a -``PrintVisitor`` traverser that printed "Here" before every visit, we -could override ``VisitExpr``: - -.. 
code:: c - - void PrintVisitor::VisitExpr(const Expr& expr) { - std::cout << "Here" << std::endl; - ExprFunctor::VisitExpr(expr); - } - -``ExprFunctor`` itself is a very general class, which is why more often than -not, you will be extending ``ExprVisitor`` or ``ExprMutator``. These classes -extend ``ExprFunctor`` and provide default implementations of ``VisitExpr_`` -that capture common traversal patterns for each expression type. Having these -default implementations means we only need to provide overriding -implementations for the expression types where we want different behavior. We -describe each subclass on its own in the following sections. - -Expression Visitors -~~~~~~~~~~~~~~~~~~~ - -``ExprVisitor`` is for passes that don't modify the program and instead -perform program analyses and collect information. With this class, -``VisitExpr`` and the private counterparts return nothing. The ``VisitExpr_`` -implementations provided by this class simply visit all of the expression's -fields that are expressions. The default implementation for ``IfNode`` is -shown below. - -.. code:: c - - void ExprVisitor::VisitExpr_(const IfNode* op) { - this->VisitExpr(op->cond); - this->VisitExpr(op->true_branch); - this->VisitExpr(op->false_branch); - } - -Note that we're calling ``VisitExpr`` and not ``VisitExpr_`` here, so we can -use the vtable in ``ExprFunctor`` for routing. - -Now, if we wanted to write a class ``CallChecker`` that checks if any -function calls appear in the program, we would only need to extend -``ExprVisitor`` and define the following ``VisitExpr_`` method: - -.. code:: c - - void VisitExpr_(const CallNode* n) final { - result_ = true; - } - -where ``result_`` is a field. In this case, we don't need to further recurse -on the fields of the ``CallNode``, because ``result_`` is already true and we -now know the original expression contains a call. To make this visitor -usable, we would provide the following public method: - -.. code:: c - - bool Check(const Expr& expr) final { - result_ = false; - VisitExpr(expr); - return result_; - } - -And that's all we need. It is very common to define a public interface that -performs some bookkeeping before invoking the top-level recursion. We could -of course further wrap the API by making a standalone procedure that creates -a ``CallChecker`` instance and calls ``Check`` on it, but the takeaway is -that we've achieved our goal with very little effort. - -Expression Mutators -~~~~~~~~~~~~~~~~~~~ - -``ExprMutator`` is for passes that transform the program in some way. With -this class, ``VisitExpr`` and its private counterparts return ``Expr``. The -default ``VisitExpr_`` implementations provided by this class visit all of -the expression's fields that are expressions and set the fields to be the -result of visiting them. The default implementation for ``TupleGetItemNode`` -is shown below. - -.. code:: c - - Expr ExprMutator::VisitExpr_(const TupleGetItemNode* g) { - auto t = this->Mutate(g->tuple); - if (g->tuple == t) { - return GetRef(g); - } else { - return TupleGetItem(t, g->index); - } - } - -There are a few things to notice here. First, ``Mutate`` is an alias for -``VisitExpr`` in ``ExprMutator``. Second, we only return a new node if the -call to ``Mutate`` modified the ``tuple`` field. This method of update is -called a functional update and doing so avoids unnecessary allocations. - -One feature ``ExprMutator`` has that ``ExprVisitor`` doesn't is a built-in -``memo_`` field for caching results. 
It makes sense that ``ExprMutator`` has -a memoizer, because we know which types of results we're caching (i.e., -``Expr``), whereas the visit methods of ``ExprVisitor`` don't return -anything. Usually, when we want to cache results in a subclass of -``ExprVisitor``, we need to define the cache ourselves. - -Now, if we wanted to write a class ``IfCollapser`` that replaces every if -statement with its true branch, we would override ``VisitExpr_`` for -``IfNode``: - -.. code:: c - - Expr ExprMutator::VisitExpr_(const IfNode* op) { - return this->Mutate(op->true_branch); - } - -Note that the returned expression will not necessarily be an ``IfNode``, and -this is fine, because the return type is ``Expr``. Now, we create the public -interface: - -.. code:: c - - Expr CollapseIfs(const Expr& expr) final { - return this->Mutate(expr); - } - -With this mutator, we didn't need to do any bookkeeping, but we still want to -follow the convention of having a descriptive method as the interface. - -Example: Constant Folding -------------------------- - -In order to better understand the process of writing a pass, we will look at -the constant folding pass (found in `src/relay/transforms/fold_constant.cc`_) -as a guide, because it is a relatively simple pass that incorporates -both types of traversals. - -Constant folding involves evaluating expressions in the program that only -involve constant values, then replacing those expressions with the result -of evaluating them. The goal of this pass is to frontload all of the -computations that we can. To achieve this, the constant folding pass makes -use of a visitor (``ConstantChecker``) and a mutator (``ConstantFolder``). - -The ``ConstantChecker`` Visitor -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This visitor is used to check if an expression is constant. In Relay, we -define an expression to be constant if it is a ``ConstantNode`` or it is a -``TupleNode`` with only constant fields. - -We use a ``memo_`` field to map from nodes to whether they are constant and -to cache these results. Below are the ``VisitExpr_`` definitions in the -``ConstantChecker``. - -.. code:: c - - void VisitExpr_(const ConstantNode* n) final { - memo_[GetRef(n)] = true; - } - - void VisitExpr_(const TupleNode* n) final { - bool result = true; - for (const auto& field : n->fields) { - if (!Check(field)) { - result = false; - break; - } - } - memo_[GetRef(n)] = result; - } - -The bookkeeping used to coordinate these definitions is a ``Check`` method -that returns whether the given expression is considered constant. - -.. code:: c - - bool Check(const Expr& expr) { - const auto it = memo_.find(expr); - if (it != memo_.end()) - return it->second; - VisitExpr(expr); - return memo_[expr]; - } - -We don't modify ``memo_`` for every node we encounter; instead we only modify -``memo_`` when the encountered node could potentially be constant. Then we -rely on the default value being false when ``memo_`` doesn't contain -``expr``. - -The ``ConstantFolder`` Mutator -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This mutator performs the bulk of the constant folding pass and internally -uses ``ConstantChecker``. In Relay, there are three node types that are -involved in constant folding: ``LetNode``, ``TupleItemGetNode``, and -``CallNode``. In the following paragraphs, we explain the roles of each in -the pass. - -.. 
code:: c - - Expr VisitExpr_(const LetNode* op) final { - Expr value = this->Mutate(op->value); - if (value.as()) { - memo_[op->var] = value; - return this->Mutate(op->body); - } else { - Var var = Downcast(this->Mutate(op->var)); - Expr body = this->Mutate(op->body); - if (var.same_as(op->var) && - value.same_as(op->value) && - body.same_as(op->body)) { - return GetRef(op); - } else { - return Let(var, value, body); - } - } - } - -In the ``LetNode`` case, we first attempt to const-fold the value being bound -in the expression. If we can, then we populate ``memo_`` and return the -result of visiting the body---essentially, propagating the bound value to its -use sites in the body. If we can't const-fold the bound value, we mimic the -default implementation. - -.. code:: c - - Expr VisitExpr_(const TupleGetItemNode* op) final { - Expr res = ExprMutator::VisitExpr_(op); - op = res.as(); - if (const auto* tuple = op->tuple.as()) { - return tuple->fields[op->index]; - } else { - return res; - } - } - -In the ``TupleItemGetNode`` case, we check if ``op->tuple`` field is a -``TupleNode``. If so, we replace the tuple get with the field of the tuple -pointed to by ``op->index``. The reason we need to check is because -``op->tuple`` might evaluate to a tuple, without itself being a tuple. - -.. code:: c - - Expr VisitExpr_(const CallNode* call) final { - static auto op_stateful = Op::GetAttrMap("TOpIsStateful"); - Expr res = ExprMutator::VisitExpr_(call); - call = res.as(); - // We don't constant fold function with zero arguments. - // This is a heuristic that is useful. - // For example it is harmful to fold ones(shape=(4, 5)). - if (call->args.size() == 0) return res; - const OpNode* op = call->op.as(); - if (op == nullptr) return res; - // skip stateful ops. - if (op_stateful.get(GetRef(op), false)) return res; - bool all_const_args = true; - for (Expr arg : call->args) { - if (!checker_.Check(arg)) { - all_const_args = false; - } - } - if (all_const_args) { - return ConstEvaluate(res); - } else { - return res; - } - } - -In the ``CallNode`` case, we first use the ``VisitExpr_`` of ``ExprMutator`` -to visit the call, which const-folds all of the fields of the call. We use -``ExprMutator::VisitExpr_`` instead of ``VisitExpr``, because we want to -bypass the vtable (to avoid an infinite loop) and use the default -implementation provided by ``ExprMutator``. Then we evaluate the call only if -all of the arguments are constant (using ``ConstantChecker``). Evaluating the -call produces a **value**, so we use a helper method ``ValueToExpr`` to allow -us to place the evaluated expression back into the AST. - -Now, we construct a more convenient interface ``FoldConstant`` for our constant -folder. ``FoldConstant`` is a standalone function outside of the ``ConstantFolder`` -class that takes an expression and internally creates and uses a -``ConstantFolder`` instance (the full definition can be found in -`src/relay/transforms/fold_constant.cc`_). - - -Registering a Pass with the Pass Manager -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -*Note: please see the documentation on the :ref:`pass-infra` for more specific detail on this subject.* - -With the AST traversers written, the pass can be registered to become a TVM -API endpoint with the following code: - -.. 
code:: c - - namespace transform { - - Pass FoldConstant() { - runtime::TypedPackedFunc pass_func = - [=](Function f, Module m, PassContext pc) { - return Downcast(FoldConstant(f)); - }; - return CreateFunctionPass(pass_func, 2, "FoldConstant", {}); - } - - } // namespace transform - -If the ``Pass`` object produced by the above code is given to the pass infrastructure, -it will ensure that the AST traversal is applied to every function in the -given Relay module, which is the behavior one would expect for a constant folding -pass (it should fold all constants where possible). - -The function ``CreateFunctionPass`` -allows for registering the optimization level of the pass (in this case, 2), which can -be used to group together passes based on their general utility, a name for the pass, -and any dependencies for the pass. A pass's dependencies are given as a list of any passes -whose results are necessary to be able to run the current pass. ``FoldConstant`` does not -have any dependencies, but many Relay passes do depend on having type information, -so ``InferType`` is a common dependency; others may depend on the program's being in -A-normal form, via the ``ToANormalForm`` pass. - -Note that the ``PassContext`` object contains information a pass uses for -error reporting and configuration options; ``FoldConstant`` does not need -this information but other passes may reference their ``PassContext`` objects. - -The pass can now be invoked via the pass infrastructure, though it's a good idea to -also add a Python binding for the pass, as in this code snippet: - -.. code:: c - - TVM_REGISTER_GLOBAL("relay._transform.FoldConstant") - .set_body_typed(FoldConstant); - -Once ``Pass`` objects are defined in the above fashion, they can be invoked using the -pass infrastructure's ``Sequential`` construct, which takes a list of passes and applies -them in sequence to a Relay module, obtaining a transformed module as a result. For example, -the below code applies both the ``FoldConstant`` and ``ToANormalForm`` passes -(one after the other) to each function in ``mod`` and obtains a new module. - -.. code:: python - - seq = transform.Sequential([ - relay.transform.FoldConstant(), - relay.transform.ToANormalForm() - ]) - new_mod = seq(mod) - -More detail about registration can be found in :ref:`tvm-runtime-system` and more -information about the pass manager interface can be found in :ref:`pass-infra`. -Relay's standard passes are listed in `include/tvm/relay/transform.h`_ and implemented -in `src/relay/transforms/`_. - -.. _include/tvm/relay/transform.h: https://github.com/apache/tvm/blob/main/include/tvm/relay/transform.h - -.. _src/relay/transforms/: https://github.com/apache/tvm/tree/main/src/relay/transforms - -.. _src/relay/transforms/fold_constant.cc: https://github.com/apache/tvm/blob/main/src/relay/transforms/fold_constant.cc diff --git a/docs/dev/how_to/relay_bring_your_own_codegen.rst b/docs/dev/how_to/relay_bring_your_own_codegen.rst deleted file mode 100644 index c106bb2a6372..000000000000 --- a/docs/dev/how_to/relay_bring_your_own_codegen.rst +++ /dev/null @@ -1,960 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. 
http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _relay-bring-your-own-codegen: - -============================= -Bring Your Own Codegen To TVM -============================= - -As the number of hardware devices targeted by deep learning workloads keeps increasing, the required knowledge for users to achieve high performance on various devices keeps increasing as well. To free data scientists from worrying about the performance when developing a new model, hardware backend providers either provide libraries such as DNNL(Intel OneDNN) or cuDNN with many commonly used deep learning operators, or provide frameworks such as TensorRT to let users describe their models in a certain way to achieve high performance. However, users have to learn a new programming interface when they attempt to work on a new library or device. As a result, the demand for a unified programming interface becomes more and more important to 1) let all users and hardware backend providers stand on the same page, and 2) provide a feasible solution to allow specialized hardware or library to only support widely used operators with extremely high performance, but fallback unsupported operators to general devices like CPU/GPU. - -In this developer guide, we demonstrate how you, as a hardware backend provider, can easily implement your own codegen and register it as a Relay backend compiler to support your hardware device/library. This guide covers two types of codegen based on different graph representations you need: - -**1. You want to generate C code.** - -If your hardware already has a well-optimized C/C++ library, such as Intel CBLAS/MKL to CPU and NVIDIA CUBLAS to GPU, then this is what you are looking for. Fortunately, C source code module is fully compatible with TVM runtime module, which means the generated code could be compiled by any C/C++ compiler with proper compilation flags, so the only task you have is to implement a codegen that generates C code for subgraphs and a C source module to integrate into TVM runtime module. We will demonstrate how to implement a C code generator for your hardware in the following section. - -**2. You want to generate any other graph representations.** - -Your hardware may require other forms of graph representation, such as JSON. In this case, you need to implement not only a codegen but also a customized TVM runtime module to let TVM runtime know how this graph representation should be executed. If you already have a complete graph execution engine for your hardware, such as TensorRT for GPU, then this is a solution you can consider. - -After you finish the codegen and runtime, you can then let your customers annotate their models with your customized tag to make use of them. The tutorial for end-users to annotate and launch a specific codegen is **here (TBA)**. - -********************* -Implement a C Codegen -********************* - -In this part, we demonstrate how to implement a codegen that generates C code with pre-implemented operator functions. To simplify, our example codegen does not depend on third-party libraries. Instead, we manually implement two macros in C: - -.. 
code-block:: c++ - - #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - out[i] = a[i] p_OP_ b[i]; \ - } \ - } - - #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - for (int64_t j = 0; j < p_DIM2_; ++j) { \ - int64_t k = i * p_DIM2_ + j; \ - out[k] = a[k] p_OP_ b[k]; \ - } \ - } \ - } - -With the two macros, we can generate binary operators for 1-D and 2-D tensors. For example, given a subgraph as follows. Assuming all inputs are 2-D tensors with shape (10, 10). - -:: - - c_compiler_input0 - | - add <-- c_compiler_input1 - | - subtract <-- c_compiler_input2 - | - multiply <-- c_compiler_input3 - | - out - -Our goal is to generate the following compilable code to execute the subgraph: - -.. code-block:: c++ - - #include - #include - #include - #include - #include - #include - - #define GCC_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - out[i] = a[i] p_OP_ b[i]; \ - } \ - } - - #define GCC_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - for (int64_t j = 0; j < p_DIM2_; ++j) { \ - int64_t k = i * p_DIM2_ + j; \ - out[k] = a[k] p_OP_ b[k]; \ - } \ - } \ - } - - // Note 1 - GCC_BINARY_OP_2D(gcc_0_0, *, 10, 10); - GCC_BINARY_OP_2D(gcc_0_1, -, 10, 10); - GCC_BINARY_OP_2D(gcc_0_2, +, 10, 10); - - // Note 2 - extern "C" void gcc_0_(float* gcc_input0, float* gcc_input1, - float* gcc_input2, float* gcc_input3, float* out) { - float* buf_0 = (float*)malloc(4 * 100); - float* buf_1 = (float*)malloc(4 * 100); - gcc_0_2(gcc_input0, gcc_input1, buf_0); - gcc_0_1(buf_0, gcc_input2, buf_1); - gcc_0_0(buf_1, gcc_input3, out); - free(buf_0); - free(buf_1); - } - - // Note 3 - extern "C" int gcc_0_wrapper(DLTensor* arg0, DLTensor* arg1, DLTensor* arg2, - DLTensor* arg3, DLTensor* out) { - gcc_0_(static_cast(arg0->data), static_cast(arg1->data), - static_cast(arg2->data), static_cast(arg3->data), - static_cast(out->data)); - return 0; - } - TVM_DLL_EXPORT_TYPED_FUNC(gcc_0, gcc_0_wrapper); - -Here we highlight the notes marked in the above code: - -* **Note 1** is the function implementation for the three nodes in the subgraph. - -* **Note 2** is a function to execute the subgraph by allocating intermediate buffers and invoking corresponding functions. - -* **Note 3** is a TVM runtime compatible wrapper function. It accepts a list of input tensors and one output tensor (the last argument), casts them to the right data type, and invokes the subgraph function described in Note 2. In addition, ``TVM_DLL_EXPORT_TYPED_FUNC`` is a TVM macro that generates another function ``gcc_0`` with unified the function arguments by packing all tensors to ``TVMArgs``. As a result, the TVM runtime can directly invoke ``gcc_0`` to execute the subgraph without additional efforts. With the above code generated, TVM is able to compile it along with the rest parts of the graph and export a single library for deployment. - -In the rest of this section, we will implement a codegen step-by-step to generate the above code. Your own codegen has to be located at ``src/relay/backend/contrib//``. In our example, we name our codegen "codegen_c" and put it under `/src/relay/backend/contrib/codegen_c/ `_. 
Feel free to check this file for a complete implementation. - -Specifically, we are going to implement two classes in this file and here is their relationship: - -:: - - subgraph subgraph - TVM backend -----------------------------> CSourceCodegen -------------> CodegenC - ^ | ^ | - | | | | - ---------------------------------------- ------------------------ - generated C source runtime module generated C code - -When TVM backend finds a function (subgraph) in a Relay graph is annotated with the registered compiler tag (``ccompiler`` in this example), TVM backend invokes ``CSourceCodegen`` and passes the subgraph. ``CSourceCodegen``'s member function ``CreateCSourceModule`` will 1) generate C code for the subgraph, and 2) wrap the generated C code to a C source runtime module for TVM backend to compile and deploy. In particular, the C code generation is transparent to the ``CodegenC`` class because it provides many useful utilities to ease the code generation implementation. The following sections will implement these two classes in the bottom-up order. - -Implement CodegenC -================== - -In ``src/relay/backend/contrib/codegen_c/codegen.cc``, we first create a codegen class skeleton under the namespace of ``tvm.relay.contrib``: - -.. code-block:: c++ - - #include - #include - #include - #include - #include - - #include - #include - - #include "codegen_c.h" - - namespace tvm { - namespace relay { - namespace contrib { - - class CodegenC : public ExprVisitor, public CodegenCBase { - public: - explicit CodegenC(const std::string& id) { this->ext_func_id_ = id; } - - void VisitExpr_(const VarNode* node) { ; } - void VisitExpr_(const CallNode* call) final { ; } - std::string JIT() { ; } - - private: - /*! \brief The function id that represents a C source function. */ - std::string ext_func_id_ = ""; - /*! \brief The index of a wrapped C function. */ - int func_idx = 0; - /*! \brief The index of allocated buffers. */ - int buf_idx_ = 0; - /*! \brief The arguments of a C compiler compatible function. */ - std::vector ext_func_args_; - /*! \brief The statements of a C compiler compatible function. */ - std::vector ext_func_body; - /*! \brief The declaration statements of a C compiler compatible function. */ - std::vector func_decl_; - /*! \brief The declaration statements of buffers. */ - std::vector buf_decl_; - /*! \brief The name and index pairs for output. */ - std::vector> out_; - } - -The ``CodegenC`` class inherits two classes: ``ExprVisitor`` provides abilities to traverse subgraphs and collects the required information and generate subgraph functions such as ``gcc_0_``; ``CodegenCBase`` provides abilities and utilities to generate wrapper functions such as ``gcc_0`` in the above example. As can be seen, we only need to implement three functions in this codegen class to make it work. - -Code Generation for Operators ------------------------------ - -We first implement ``VisitExpr_(const CallNode* call)``. This function visits all call nodes when traversing the subgraph. Each call node contains an operator that we want to offload to your hardware. As a result, we need to generate the corresponding C code with correct operators in topological order. We implement this function step-by-step as follows. - -**1. 
Generate the function declaration** - -Example Result: ``GCC_BINARY_OP_2D(gcc_0_0, *, 10, 10);`` - -To generate the function declaration, as shown above, we need 1) a function name (e.g., ``gcc_0_0``), 2) the type of operator (e.g., ``*``), and 3) the input tensor shape (e.g., ``(10, 10)``). Fortunately, this information can be obtained easily from ``CallNode``: - -.. code-block:: c++ - - std::ostringstream macro_stream; - std::ostringstream decl_stream; - std::ostringstream buf_stream; - - // Generate a unique function name you like. - std::string func_name = ext_func_id_ + "_" + std::to_string(func_idx++); - - // Make function declaration string. - macro_stream << "CSOURCE_BINARY_OP_" << call->args.size() << "D(" << func_name << ", "; - - // Check the operator type. - if (IsOp(call, "add")) { - macro_stream << "+"; - } else if (IsOp(call, "subtract")) { - macro_stream << "-"; - } else if (IsOp(call, "multiply")) { - macro_stream << "*"; - } else { - LOG(FATAL) << "Unrecognized op"; - } - - // Extract the input tensor shape. - auto in_shape = GetShape(call->args[0]->checked_type()); - for (size_t i = 0; i < in_shape.size(); ++i) { - macro_stream << ", " << in_shape[i]; - } - macro_stream << ");"; - func_decl_.push_back(macro_stream.str()); - -As can be seen, we push the generated code to class member variables ``func_decl_``. It means after we finish traversing the entire subgraph, we have collected all required function declarations and the only thing we need to do is having them compiled by GCC. The rest implementation of ``VisitExpr_(const CallNode* call)`` also follow this concept. - -**2. Generate the function call** - -Example Result: ``gcc_0_0(buf_1, gcc_input3, out);`` - -After generating the function declaration, we need to generate a function call with proper inputs and outputs. To know which inputs or buffers we should put when calling this function, we have to visit its arguments: - -.. code-block:: c++ - - bool first = true; - decl_stream << func_name << "("; - for (size_t i = 0; i < call->args.size(); ++i) { - VisitExpr(call->args[i]); // Note 1 - for (auto out : out_) { - if (!first) { - decl_stream << ", "; - } - first = false; - decl_stream << out.first; - } - } - // Note 2 - -Again, we want to highlight the notes in the above code: - -**Note 1**: ``VisitExpr(call->args[i])`` is a recursive call to visit arguments of the current function. An argument could be an output of another node or an input tensor. In our example implementation, we make sure every node updates a class variable ``out_`` before leaving the visitor. Here is an illustration: - -:: - - arg_node arg_node <- Visit arg (Note 1) arg_node - | | | - curr_node <- Process curr_node curr_node <- Put "buf_0" as an input buffer - - (a) out_ = {} (b) out_ = {} (c) out_ = {("buf_0", 20)} - - -We can see in the above figure, class variable ``out_`` is empty before visiting the argument node, and it was filled with the output buffer name and size of ``arg_node``. As a result, when we finished visiting the argument node, we know the proper input buffer we should put by looking at ``out_``. You will find out how we update ``out_`` at the end of this section as well as the next section. - -**Note 2**: You may notice that we did not close the function call string in this step. The current function call string looks like: ``gcc_0_0(buf_1, gcc_input3``. This is because we have not put the last argument (i.e., the output) to this call. 
The output of a function call could be either an allocated temporary buffer or the subgraph output tensor. For simplify, in this example, we allocate an output buffer for every call node (next step) and copy the result in the very last buffer to the output tensor. - -**3. Generate the output buffer** - -Example Result: ``float* buf_0 = (float*)malloc(4 * 100);`` - -As mentioned in the previous step, in addition to the subgraph input and output tensors, we may also need buffers to keep the intermediate results. To generate the buffer, we extract the shape information to determine the buffer type and size: - -.. code-block:: c++ - - // This example only supports single output. - auto type_node = call->checked_type().as(); - ICHECK(type_node != nullptr && runtime::TypeMatch(type_node->dtype, kDLFloat, 32)) - << "Only support single output tensor with float type"; - - // Generate a unique buffer name. - std::string out = "buf_" + std::to_string(buf_idx_++); - - // Extract the shape to be the buffer size. - auto out_shape = GetShape(call->checked_type()); - int out_size = 1; - for (size_t i = 0; i < out_shape.size(); ++i) { - out_size *= out_shape[i]; - } - - // Make the buffer allocation and push to the buffer declarations. - buf_stream << "float* " << out << " = (float*)std::malloc(4 * " << out_size << ");"; - buf_decl_.push_back(buf_stream.str()); - -After we have allocated the output buffer, we can now close the function call string and push the generated function call to a class variable ``ext_func_body``. - -.. code-block:: c++ - - decl_stream << ", " << out << ");"; - ext_func_body.push_back(decl_stream.str()); - -**4. Update output buffer** - -To let the next node, which accepts the output of the current call node as its input, know which buffer it should take, we need to update the class variable ``out_`` before leaving this visit function: - -.. code-block:: c++ - - out_.clear(); - out_.push_back({out, out_size}); - -Congratulations! we have finished the most difficult function in this class. In the next two sections, we just need to make up some minor missing parts in this function. - -Code Generation for Input Variables ------------------------------------ - -Recall that we collected the input buffer information by visiting the arguments of a call node (2nd step in the previous section), and handled the case when its argument is another call node (4th step). In this section, we demonstrate how to handle other nodes by taking ``VarNode`` as an example. - -``VarNode`` represents input tensors in a model. The only but important information it has is a name hint (e.g., ``data``, ``weight``, etc). When visiting a ``VarNode``, we simply update class variable ``out_`` to pass the name hint so that the descendant call nodes can generate the correct function call. - -.. code-block:: c++ - - void VisitExpr_(const VarNode* node) { - ext_func_args_.push_back(node->name_hint()); - out_.clear(); - out_.push_back({node->name_hint(), 0}); - } - -Note that in this example we assume the subgraph we are offloading has only call nodes and variable nodes. If your subgraphs contain other types of nodes, such as ``TupleNode``, then you also need to visit them and bypass the output buffer information. - -Code Emitting -------------- - -The final part in this codegen class is a ``JIT`` function that emits a C function for the subgraph and uses the C code we just generated as the function body. 
Remember, in addition to the subgraph function we generated in the previous sections, we also need a wrapper function with a unified argument for TVM runtime to invoke and pass data. Fortunately, the base class we inherited already provides an implementation, ``JitImpl``, to generate the function. For example, we can invoke ``JitImpl`` as follows: - -.. code-block:: c++ - - JitImpl("gcc_0" /* Subgraph symbol (ID) */, - {"gcc_input0", "gcc_input1", "gcc_input2", "gcc_input3"} /* Input arguments */, - {"float *buf_0 = (float*)malloc(4 * 20)", ...} /* Buffer allocations */, - {"gcc_0_2(gcc_input0, gcc_input1, buf_0);"} /* Function body */, - {"out"} /* Output */); - -The above call will generate three functions (one from the TVM wrapper macro): - -1. The subgraph function ``gcc_0_`` (with one more underline at the end of the function name) with all C code we generated to execute a subgraph. - -2. The wrapper function ``gcc_0__wrapper_`` with a list of ``DLTensor`` arguments that casts data to the right type and invokes ``gcc_0_``. - -3. The TVM runtime compatible function ``gcc_0`` with TVM unified function arguments that unpacks TVM packed tensors and invokes ``gcc_0__wrapper_``. - -Accordingly, the only thing we need in ``JIT`` implementation is passing all subgraph function code we generated to ``JitImpl``: - -.. code-block:: c++ - - std::string JIT() { - // Write function macros - for (auto decl : func_decl_) { - code_stream_ << decl << "\n"; - } - return JitImpl(ext_func_id_, ext_func_args_, buf_decl_, ext_func_body, out_); - } - -All variables (``ext_func_id``, etc) we passed are class variables and were filled when we traversed the subgraph. - -Implement CSourceCodegen -======================== - -Again, let's create a class skeleton and implement the required functions. Note that it inherits ``CSourceModuleCodegenBase`` - -.. code-block:: c++ - - class CSourceCodegen : public CSourceModuleCodegenBase { - public: - // Pass a subgraph function, and generate the C code. - void GenCFunc(const Function& func) { ; } - - // Use GenCFunc to generate the C code and wrap it as a C source module. - runtime::Module CreateCSourceModule(const NodeRef& ref) override { ; } - - private: - std::ostringstream code_stream_; - }; - -Implement GenCFunc ------------------- - -``GenCFunc`` simply uses the ``CodegenC`` we just implemented to traverse a Relay function (subgraph) and obtains the generated C code. The builtin function ``GetExtSymbol`` retrieves a unique symbol name (e.g., ``gcc_0``) in the Relay function and we **must** use it as the C function name, because this symbol is going to be used for DSO runtime lookup. - -.. code-block:: c++ - - void GenCFunc(const Function& func) { - ICHECK(func.defined()) << "Input error: expect a Relay function."; - - // Record the external symbol for runtime lookup. - auto sid = GetExtSymbol(func); - - CodeGenC builder(sid); - builder.VisitExpr(func->body); - code_stream_ << builder.JIT(); - } - -Implement CreateCSourceModule ------------------------------ - -This function creates a runtime module for the external library. In this example, we create a CSourceModule that can be directly compiled and linked together with a TVM generated DSOModule. After you have implemented ``CodegenC``, implementing this function is relatively straightforward: - -.. 
code-block:: c++ - - runtime::Module CreateCSourceModule(const NodeRef& ref) override { - // Create headers - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - - // Append some common macro for operator definition. - const char* operator_macro = R"op_macro( - #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - out[i] = a[i] p_OP_ b[i]; \ - } \ - } - - #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_) \ - extern "C" void p_ID_(float* a, float* b, float* out) { \ - for (int64_t i = 0; i < p_DIM1_; ++i) { \ - for (int64_t j = 0; j < p_DIM2_; ++j) { \ - int64_t k = i * p_DIM2_ + j; \ - out[k] = a[k] p_OP_ b[k]; \ - } \ - } \ - } - )op_macro"; - - code_stream_ << operator_macro << "\n\n"; - - // Generate C code for the subgraph. - if (ref->IsInstance()) { - GenCFunc(Downcast(ref)); - } else if (ref->IsInstance()) { - relay::Module mod = Downcast(ref); - for (const auto& it : mod->functions) { - GenCFunc(Downcast(it.second)); - } - } else { - LOG(FATAL) << "The input ref is expected to be a Relay function or module" - << "\n"; - } - - // Create a CSourceModule - const auto* pf = runtime::Registry::Get("module.csource_module_create"); - ICHECK(pf != nullptr) << "Cannot find csource module to create the external runtime module"; - return (*pf)(code_stream_.str(), "cc"); - } - -Register Your Codegen -===================== - -The last step is registering your codegen to TVM backend. We first implement a simple function to invoke our codegen and generate a runtime module. - -.. code-block:: c++ - - runtime::Module CCompiler(const NodeRef& ref) { - CSourceCodegen csource; - return csource.CreateCSourceModule(ref); - } - -Finally, we register this function to TVM backend: - -.. code-block:: c++ - - TVM_REGISTER_GLOBAL("relay.ext.ccompiler").set_body_typed(CCompiler); - -where ``ccompiler`` is a customized tag to let TVM know this is the codegen it should use to generate and offload subgraphs when the subgraph is annotated with ``ccompiler``. - -Finally, a good practice is to set up a CMake configuration flag to include your compiler only for your customers. We first create a cmake file: ``cmake/modules/contrib/CODEGENC.cmake``: - -.. code-block:: cmake - - if(USE_CODEGENC) - file(GLOB CSOURCE_RELAY_CONTRIB_SRC src/relay/backend/contrib/codegen_c/codegen.cc) - list(APPEND COMPILER_SRCS ${CSOURCE_RELAY_CONTRIB_SRC}) - endif(USE_CODEGENC) - -So that users can configure whether to include your compiler when configuring TVM using ``config.cmake``: - -.. code-block:: cmake - - set(USE_CODEGENC ON) - -******************************************* -Implement a Codegen for Your Representation -******************************************* - -Although we have demonstrated how to implement a C codegen, your hardware may require other forms of graph representation, such as JSON. In this case, you could modify ``CodegenC`` class we have implemented to generate your own graph representation and implement a customized runtime module to let TVM runtime know how this graph representation should be executed. - -To simplify, we define a graph representation named "ExampleJSON" in this guide. ExampleJSON does not mean the real JSON but just a simple representation for graphs without a control flow. 
For example, assuming we have the following subgraph named ``subgraph_0``: - -:: - - input0 - | - add <-- input1 - | - subtract <-- input2 - | - multiply <-- input3 - | - out - -Then the ExampleJON of this subgraph looks like: - -.. code-block:: none - - subgraph_0 - input 0 10 10 - input 1 10 10 - input 2 10 10 - input 3 10 10 - add 4 inputs: 0 1 shape: 10 10 - sub 5 inputs: 4 2 shape: 10 10 - mul 6 inputs: 5 3 shape: 10 10 - -The ``input`` keyword declares an input tensor with its ID and shape; while the other statements describes computations in `` inputs: [input ID] shape: [shape]`` syntax. - -In this section, our goal is to implement the following customized TVM runtime module to execute ExampleJSON graphs. - -.. code-block:: c++ - - runtime::Module ExampleJsonCompiler(const NodeRef& ref) { - ExampleJsonCodeGen codegen(ref); - std::string code = codegen.gen(); // Note 1 - const auto* pf = runtime::Registry::Get("module.examplejson_module_create"); // Note 2 - ICHECK(pf != nullptr) << "Cannot find ExampleJson module to create the external runtime module"; - return (*pf)(code); - } - TVM_REGISTER_GLOBAL("relay.ext.examplejsoncompiler").set_body_typed(ExampleJsonCompiler); - -**Note 1**: We will implement a customized codegen later to generate a ExampleJSON code string by taking a subgraph. - -**Note 2**: This line obtains a pointer to a function for creating the customized runtime module. You can see that it takes subgraph code in ExampleJSON format we just generated and initializes a runtime module. - -In the following sections, we are going to introduce 1) how to implement ``ExampleJsonCodeGen`` and 2) how to implement and register ``examplejson_module_create``. - -Implement ExampleJsonCodeGen -============================ - -Similar to the C codegen, we also derive ``ExampleJsonCodeGen`` from ``ExprVisitor`` to make use of visitor patterns for subgraph traversing. On the other hand, we do not have to inherit ``CodegenCBase`` because we do not need TVM C++ wrappers. The codegen class is implemented as follows: - -.. code-block:: c++ - - #include - #include - #include - #include - #include - - #include - #include - - namespace tvm { - namespace relay { - namespace contrib { - - class ExampleJsonCodeGen : public ExprVisitor { - public: - explicit ExampleJsonCodeGen(); - - // Note 1 - void VisitExpr_(const VarNode* node) { /* Skip in this example. */ } - void VisitExpr_(const CallNode* call) final { /* Skip in this example. */ } - - // Note 2 - std::string gen(NodeRef& ref) { - this->code = ""; - if (ref->IsInstance()) { - this->visit(Downcast(ref)); - } else if (ref->IsInstance()) { - relay::Module mod = Downcast(ref); - for (const auto& it : mod->functions) { - this->visit(Downcast(it.second)); - } - } else { - LOG(FATAL) << "The input ref is expected to be a Relay function or module"; - } - return this->code; - } - - private: - /*! \brief The function id that represents a C source function. */ - std::string code; - } - -**Note 1**: We again implement corresponding visitor functions to generate ExampleJSON code and store it to a class variable ``code`` (we skip the visitor function implementation in this example as their concepts are basically the same as C codegen). After finished the graph visiting, we should have an ExampleJSON graph in ``code``. - -**Note 2**: We define an internal API ``gen`` to take a subgraph and generate a ExampleJSON code. This API can be in an arbitrary name you prefer. 
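For readers who want a concrete picture of the omitted visitors, below is a minimal, illustrative sketch of how a call-node visitor *could* emit one line in the format above. The counter ``node_idx_``, the map ``node_id_`` (from visited expressions to their IDs), and the reuse of the ``GetShape``/``IsOp`` helpers from the C codegen are assumptions made only for this sketch; they are not part of the original example.

.. code-block:: c++

    // Sketch only: emits "<op> <id> inputs: <ids...> shape: <dims...>" per call.
    // Assumes hypothetical members `node_idx_` (an int counter) and `node_id_`
    // (a map from visited expressions to their assigned IDs). Input variables
    // are assumed to have received IDs in the (omitted) VarNode visitor.
    void VisitExpr_(const CallNode* call) final {
      // Visit arguments first so their node IDs are assigned before this node's.
      std::vector<int> input_ids;
      for (const auto& arg : call->args) {
        VisitExpr(arg);
        input_ids.push_back(node_id_[arg]);
      }

      // Map the Relay operator to its ExampleJSON keyword.
      std::string op;
      if (IsOp(call, "add")) {
        op = "add";
      } else if (IsOp(call, "subtract")) {
        op = "sub";
      } else if (IsOp(call, "multiply")) {
        op = "mul";
      } else {
        LOG(FATAL) << "Unrecognized op";
      }

      // Append one statement to the ExampleJSON string.
      int id = node_idx_++;
      node_id_[GetRef<Expr>(call)] = id;
      std::ostringstream line;
      line << op << " " << id << " inputs:";
      for (int in : input_ids) line << " " << in;
      line << " shape:";
      for (auto dim : GetShape(call->checked_type())) line << " " << dim;
      this->code += line.str() + "\n";
    }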
- -The next step is to implement a customized runtime to make use of the output of ``ExampleJsonCodeGen``. - -Implement a Customized Runtime -============================== - -In this section, we will implement a customized TVM runtime step-by-step and register it to TVM runtime modules. The customized runtime should be located at ``src/runtime/contrib//``. In our example, we name our runtime "example_ext_runtime". - -Again, we first define a customized runtime class as follows. The class has to be derived from TVM ``ModuleNode`` in order to be compatible with other TVM runtime modules. - -.. code-block:: c++ - - #include - #include - #include - #include - #include - #include - #include - #include - - #include - #include - #include - #include - #include - #include - - namespace tvm { - namespace runtime { - class ExampleJsonModule : public ModuleNode { - public: - explicit ExampleJsonModule(std::string graph_json); - - PackedFunc GetFunction(const std::string& name, - const ObjectPtr& sptr_to_self) final; - - const char* type_key() const { return "examplejson"; } - - void SaveToBinary(dmlc::Stream* stream) final; - - static Module LoadFromBinary(void* strm); - - static Module Create(const std::string& path); - - std::string GetSource(const std::string& format = ""); - - void Run(int id, const std::vector& inputs, int output); - - void ParseJson(const std::string& json); - - private: - /* \brief The json string that represents a computational graph. */ - std::string graph_json_; - /* \brief The subgraph that being processed. */ - std::string curr_subgraph_; - /*! \brief A simple graph from subgraph id to node entries. */ - std::map> graph_; - /* \brief A simple pool to contain the tensor for each node in the graph. */ - std::vector data_entry_; - /* \brief A mapping from node id to op name. */ - std::vector op_id_; - }; - -In particular, there are some functions derived from ``ModuleNode`` that we must implement in ``ExampleJsonModule``: - -* Constructor: The constructor of this class should accept a subgraph (in your representation), process and store it in any format you like. The saved subgraph could be used by the following two functions. - -* ``GetFunction``: This is the most important function in this class. When TVM runtime wants to execute a subgraph with your compiler tag, TVM runtime invokes this function from your customized runtime module. It provides the function name as well as runtime arguments, and ``GetFunction`` should return a packed function implementation for TVM runtime to execute. - -* ``SaveToBinary`` and ``LoadFromBinary``: ``SaveToBinary`` serialize the runtime module to a binary format for later deployment. This function will be called by TVM when users use ``export_library`` API. On the other hand, since we are now using our own graph representation, we have to make sure that ``LoadFromBinary`` is able to construct the same runtime module by taking the serialized binary generated by ``SaveToBinary``. - -* ``GetSource`` (optional): If you would like to see the generated ExampleJSON code, you can implement this function to dump it; otherwise you can skip the implementation. - -Other functions and class variables will be introduced along with the implementation of above must-have functions. - -Implement Constructor ---------------------- - -.. 
code-block:: c++ - - explicit ExampleJsonModule(std::string graph_json) { - this->graph_json_ = graph_json; - ParseJson(this->graph_json_); - } - -Then, we implement ``ParseJson`` to parse a subgraph in ExampleJSON format and construct a graph in memory for later usage. Since we do not support subgraph with branches in this example, we simply use an array to store every nodes in a subgraph in order. - -.. code-block:: c++ - - void ParseJson(const std::string& json) { - std::string line; - std::string curr_subgraph; - std::stringstream ss(json); - - while (std::getline(ss, line, '\n')) { - std::stringstream ss2(line); - std::string token; - int id = 0; - - ss2 >> token; - if (token.find("subgraph_") != std::string::npos) { - curr_subgraph = token; - continue; - } - - ss2 >> id; - if (op_id_.size() <= static_cast(id)) { - op_id_.resize(id + 1); - data_entry_.resize(id + 1); - } - - int64_t total_elements = 1; - std::vector shape; - if (token == "input") { - int64_t size = 0; - while (ss2 >> size) { - total_elements *= size; - shape.push_back(size); - } - } else { - op_id_[id] = token; // Note 1 - bool shape_data = false; - NodeEntry entry; - while (ss2 >> token) { - if (token == "shape:") { - shape_data = true; - } else if (shape_data) { - total_elements *= std::stoll(token); - shape.push_back(std::stoll(token)); - } else if (token != "inputs:") { - entry.inputs.push_back(std::stoi(token)); - } - } - entry.id = id; - entry.output = id; - graph_[curr_subgraph].push_back(entry); // Note 2 - } - DLDevice dev; - dev.device_type = static_cast(1); - dev.device_id = 0; - data_entry_[id] = NDArray::Empty(shape, DLDataType{kDLFloat, 32, 1}, dev); // Note 3 - } - } - -**Note 1**: We use a class variable ``op_id_`` to map from subgraph node ID to the operator name (e.g., ``add``) so that we can invoke the corresponding operator function in runtime. - -**Note 2**: We use a class variable ``graph_`` to map from subgraph name to an array of nodes. ``GetFunction`` will query graph nodes by a subgraph ID in runtime. - -**Note 3**: We use a class variable `data_entry_` to map from a subgraph node ID to a tensor data placeholder. We will put inputs and outputs to the corresponding data entry in runtime. - -Implement GetFunction ---------------------- - -After the construction, we should have the above class variables ready. We then implement ``GetFunction`` to provide executable subgraph functions to TVM runtime: - -.. code-block:: c++ - - PackedFunc GetFunction(const std::string& name, - const ObjectPtr& sptr_to_self) final { - if (this->graph_.find(name) != this->graph_.end()) { - this->curr_subgraph_ = name; - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - - // Copy input tensors to corresponding data entries. - for (auto i = 0; i < args.size(); ++i) { - ICHECK(args[i].type_code() == kNDArrayContainer || args[i].type_code() == kArrayHandle) - << "Expect NDArray or DLTensor as inputs\n"; - if (args[i].type_code() == kArrayHandle) { - DLTensor* arg = args[i]; - this->data_entry_[i].CopyFrom(arg); - } else { - NDArray arg = args[i]; - this->data_entry_[i].CopyFrom(arg); - } - } - - // Execute the subgraph. - for (const auto& it : this->graph_[this->curr_subgraph_]) { - this->Run(it.id, it.inputs, it.output); - } - ICHECK_GT(graph_.count(this->curr_subgraph_), 0U); - - // Copy the output from a data entry back to TVM runtime argument. 
- auto out_idx = graph_[this->curr_subgraph_].back().output; - if (args[args.size() - 1].type_code() == kArrayHandle) { - DLTensor* arg = args[args.size() - 1]; - this->data_entry_[out_idx].CopyTo(arg); - } else { - NDArray arg = args[args.size() - 1]; - this->data_entry_[out_idx].CopyTo(arg); - } - *rv = data_entry_.back(); - }); - } else { - LOG(FATAL) << "Unknown subgraph: " << name << "\n"; - return PackedFunc(); - } - } - -As can be seen, ``GetFunction`` is composed of three major parts. The first part copies data from TVM runtime arguments to the corresponding data entries we assigned in the constructor. The second part executes the subgraph with ``Run`` function (will implement later) and saves the results to another data entry. The third part copies the results from the output data entry back to the corresponding TVM runtime argument for output. - -Implement Run -------------- - -Now let's implement ``Run`` function. This function accepts 1) a subgraph ID, 2) a list of input data entry indexs, and 3) an output data entry index. - -.. code-block:: c++ - - void Run(int id, const std::vector& inputs, int output) { - // Make a list data entry indexs. - std::vector args(inputs.begin(), inputs.end()); - args.push_back(output); - - // Initialize data holders. - std::vector values(args.size()); - std::vector type_codes(args.size()); - - // Initialize a TVM arg setter with TVMValue and its type code. - TVMArgsSetter setter(values.data(), type_codes.data()); - - // Set each argument to its corresponding data entry. - if (op_id_[id] == "add" || op_id_[id] == "sub" || op_id_[id] == "mul") { - for (size_t i = 0; i < args.size(); i++) { - setter(i, data_entry_[args[i]]); - } - } - - // Invoke the corresponding operator function. - if (op_id_[id] == "add") { - Add(values.data(), type_codes.data(), args.size()); - } else if (op_id_[id] == "sub") { - Sub(values.data(), type_codes.data(), args.size()); - } else if (op_id_[id] == "mul") { - Mul(values.data(), type_codes.data(), args.size()); - } else { - LOG(FATAL) << "Unknown op: " << op_id_[id] << "\n"; - } - } - -``Run`` function mainly has two parts. The first part allocates a list of ``TVMValue``, and maps corresponding data entry blocks. This will become the arguments of our operator functions. The second part than invokes our operator functions. Although we use the same C functions as the previous example, you can replace ``Add``, ``Sub``, and ``Mul`` with your own engine. You only need to make sure your engine stores the results to the last argument so that they can be transferred back to TVM runtime. - -With above functions implemented, our customized codegen and runtime can now execute subgraphs. The last step is registering an API (``examplejson_module_create``) to create this module: - -.. code-block:: c++ - - TVM_REGISTER_GLOBAL("module.examplejson_module_create") - .set_body_typed([](std::string code){ - auto n = make_object(code); - return runtime::Module(n); - }); - -Implement SaveToBinary and LoadFromBinary ------------------------------------------ - -So far we have implemented the main features of a customized runtime so that it can be used as other TVM runtimes. However, when users want to save the built runtime to a disk for deployment, TVM has no idea about how to save it. This is the reason we want to implement ``SaveToBinary`` and ``LoadFromBinary``, which tell TVM how should this customized runtime be persist and restored. - -We first implement ``SaveToBinary`` function to allow users to save this module in disk. - -.. 
code-block:: c++ - - void SaveToBinary(dmlc::Stream* stream) final { - stream->Write(this->graph_json_); - } - -We can find that this function is pretty simple. Recall that the only argument we took in constructor is a subgraph representation, meaning that we only need a subgraph representation to construct/recover this customized runtime module. As a result, ``SaveToBinary`` simply writes the subgraph to an output DMLC stream. That is, when users use ``export_library`` API to export the module, the customized module will be an ExampleJSON stream of a subgraph. - -Similarity, ``LoadFromBinary`` reads the subgraph stream and re-constructs the customized runtime module: - -.. code-block:: c++ - - static Module LoadFromBinary(void* strm) { - dmlc::Stream* stream = static_cast(strm); - std::string graph_json; - stream->Read(&graph_json); - auto n = tvm::runtime::make_object(graph_json); - return Module(n); - } - -We also need to register this function to enable the corresponding Python API: - -.. code-block:: c++ - - TVM_REGISTER_GLOBAL("module.loadbinary_examplejson") - .set_body_typed(ExampleJsonModule::LoadFromBinary); - -The above registration means when users call ``tvm.runtime.load_module(lib_path)`` API and the exported library has an ExampleJSON stream, our ``LoadFromBinary`` will be invoked to create the same customized runtime module. - -In addition, if you want to support module creation directly from an ExampleJSON file, you can also implement a simple function and register a Python API as follows: - -.. code-block:: c++ - - static Module Create(const std::string& path) { - std::ifstream filep; - filep.open(path, std::ios::in); - std::string graph_json; - std::string line; - while (std::getline(filep, line)) { - graph_json += line; - graph_json += "\n"; - } - filep.close(); - auto n = tvm::runtime::make_object(graph_json); - return Module(n); - } - - TVM_REGISTER_GLOBAL("module.loadfile_examplejson") - .set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = ExampleJsonModule::Create(args[0]); - }); - -It means users can manually write/modify an ExampleJSON file, and use Python API ``tvm.runtime.load_module("mysubgraph.examplejson", "examplejson")`` to construct a customized module. - -******* -Summary -******* - -In summary, here is a checklist for you to refer: - -* A codegen class derived from ``ExprVisitor`` and ``CodegenCBase`` (only for C codegen) with following functions. - - * ``VisitExpr_(const CallNode* call)`` to collect call node information. - * Other visitor functions you needed to collect subgraph information. - * ``JIT`` to generate subgraph code. - * Register codegen. - -* A function to create ``CSourceModule`` (for C codegen). - -* A runtime module class derived from ``ModuleNode`` with following functions (for your graph representation). - - * Constructor. - * ``GetFunction`` to generate a TVM runtime compatible ``PackedFunc``. - * ``Run`` to execute a subgraph. - * Register a runtime creation API. - * ``SaveToBinary`` and ``LoadFromBinary`` to serialize/deserialize customized runtime module. - * Register ``LoadFromBinary`` API to support ``tvm.runtime.load_module(your_module_lib_path)``. - * (optional) ``Create`` to support customized runtime module construction from subgraph file in your representation. - -* An annotator to annotate a user Relay program to make use of your compiler and runtime (TBA). 
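As a closing illustration of how the pieces in this checklist meet at runtime, here is a rough C++ sketch of loading an exported library and invoking an offloaded subgraph through the standard runtime API. The file name ``deploy_lib.so``, the symbol ``gcc_0``, and the tensor shapes are placeholders, and the exact export/load flow depends on how the module is packaged (for example via ``export_library``); treat this as a sketch rather than a prescribed deployment path.

.. code-block:: c++

    #include <tvm/runtime/module.h>
    #include <tvm/runtime/ndarray.h>
    #include <tvm/runtime/packed_func.h>

    void RunOffloadedSubgraph() {
      // Loading the exported library triggers deserialization of any embedded
      // customized modules (via their registered LoadFromBinary functions).
      tvm::runtime::Module lib = tvm::runtime::Module::LoadFromFile("deploy_lib.so");

      // Look up the subgraph symbol recorded by GetExtSymbol, e.g. "gcc_0".
      tvm::runtime::PackedFunc f = lib.GetFunction("gcc_0", /*query_imports=*/true);
      ICHECK(f != nullptr) << "Subgraph function not found in the module";

      // Prepare inputs and the output as NDArrays (shapes are placeholders).
      DLDevice dev{kDLCPU, 0};
      auto a = tvm::runtime::NDArray::Empty({10, 10}, DLDataType{kDLFloat, 32, 1}, dev);
      auto b = tvm::runtime::NDArray::Empty({10, 10}, DLDataType{kDLFloat, 32, 1}, dev);
      auto out = tvm::runtime::NDArray::Empty({10, 10}, DLDataType{kDLFloat, 32, 1}, dev);

      // The TVM-compatible wrapper unpacks the packed arguments and calls the
      // generated C functions.
      f(a, b, out);
    }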
diff --git a/docs/dev/tutorial/codebase_walkthrough.rst b/docs/dev/tutorial/codebase_walkthrough.rst deleted file mode 100644 index a349b69f7b58..000000000000 --- a/docs/dev/tutorial/codebase_walkthrough.rst +++ /dev/null @@ -1,223 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -======================================= -TVM Codebase Walkthrough by Example -======================================= - -Getting to know a new codebase can be a challenge. This is especially true for a codebase like that of TVM, where different components interact in non-obvious ways. In this guide, we try to illustrate the key elements that comprise a compilation pipeline with a simple example. For each important step, we show where in the codebase it is implemented. The purpose is to let new developers and interested users dive into the codebase more quickly. - -******************************************* -Codebase Structure Overview -******************************************* - -At the root of the TVM repository, we have following subdirectories that together comprise a bulk of the codebase. - -- ``src`` - C++ code for operator compilation and deployment runtimes. -- ``src/relay`` - Implementation of Relay, a new functional IR for deep learning framework. -- ``python`` - Python frontend that wraps C++ functions and objects implemented in ``src``. -- ``src/topi`` - Compute definitions and backend schedules for standard neural network operators. - -Using standard Deep Learning terminology, ``src/relay`` is the component that manages a computational graph, and nodes in a graph are compiled and executed using infrastructure implemented in the rest of ``src``. ``python`` provides python bindings for the C++ API and driver code that users can use to execute compilation. Operators corresponding to each node are registered in ``src/relay/op``. Implementations of operators are in ``topi``, and they are coded in either C++ or Python. - -When a user invokes graph compilation by ``relay.build(...)``, the following sequence of actions happens for each node in the graph: - -- Look up an operator implementation by querying the operator registry -- Generate a compute expression and a schedule for the operator -- Compile the operator into object code - -One of the interesting aspects of the TVM codebase is that interoperability between C++ and Python is not unidirectional. Typically, all code that performs heavy lifting is implemented in C++, and Python bindings are provided for the user interface. This is also true in TVM, but in the TVM codebase, C++ code can also call into functions defined in a Python module. For example, the convolution operator is implemented in Python, and its implementation is invoked from C++ code in Relay. 
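The mechanism behind this bidirectional calling is the global ``PackedFunc`` registry shared by C++ and Python. As a rough, hypothetical illustration (the name ``my.compute_hook`` is made up for this sketch), C++ code can look up and invoke any globally registered function, regardless of which language registered it:

::

    #include <tvm/runtime/registry.h>

    // Sketch only: "my.compute_hook" is a placeholder for any function that was
    // registered globally, e.g. a Python-side schedule or strategy function.
    void CallRegisteredHook() {
      const tvm::runtime::PackedFunc* fn = tvm::runtime::Registry::Get("my.compute_hook");
      ICHECK(fn != nullptr) << "my.compute_hook is not registered";
      // Arguments and return values travel through the type-erased
      // TVMArgs/TVMRetValue calling convention shared by both languages.
      int result = (*fn)(42);
      LOG(INFO) << "hook returned " << result;
    }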
- -******************************************* -Vector Add Example -******************************************* - -We use a simple example that uses the low level TVM API directly. The example is vector addition, which is covered in detail in :ref:`tutorial-tensor-expr-get-started` - -:: - - n = 1024 - A = tvm.te.placeholder((n,), name='A') - B = tvm.te.placeholder((n,), name='B') - C = tvm.te.compute(A.shape, lambda i: A[i] + B[i], name="C") - -Here, types of ``A``, ``B``, ``C`` are ``tvm.tensor.Tensor``, defined in ``python/tvm/te/tensor.py``. The Python ``Tensor`` is backed by C++ ``Tensor``, implemented in ``include/tvm/te/tensor.h`` and ``src/te/tensor.cc``. All Python types in TVM can be thought of as a handle to the underlying C++ type with the same name. If you look at the definition of Python ``Tensor`` type below, you can see it is a subclass of ``Object``. - -:: - - @register_object - class Tensor(Object, _expr.ExprOp): - """Tensor object, to construct, see function.Tensor""" - - def __call__(self, *indices): - ... - -The object protocol is the basis of exposing C++ types to frontend languages, including Python. The way TVM implements Python wrapping is not straightforward. It is briefly covered in :ref:`tvm-runtime-system`, and details are in ``python/tvm/_ffi/`` if you are interested. - -We use the ``TVM_REGISTER_*`` macro to expose C++ functions to frontend languages, in the form of a :ref:`tvm-runtime-system-packed-func`. A ``PackedFunc`` is another mechanism by which TVM implements interoperability between C++ and Python. In particular, this is what makes calling Python functions from the C++ codebase very easy. -You can also checkout `FFI Navigator `_ which allows you to navigate between python and c++ FFI calls. - -A ``Tensor`` object has an ``Operation`` object associated with it, defined in ``python/tvm/te/tensor.py``, ``include/tvm/te/operation.h``, and ``src/tvm/te/operation`` subdirectory. A ``Tensor`` is an output of its ``Operation`` object. Each ``Operation`` object has in turn ``input_tensors()`` method, which returns a list of input ``Tensor`` to it. This way we can keep track of dependencies between ``Operation``. - -We pass the operation corresponding to the output tensor ``C`` to ``tvm.te.create_schedule()`` function in ``python/tvm/te/schedule.py``. - -:: - - s = tvm.te.create_schedule(C.op) - -This function is mapped to the C++ function in ``include/tvm/schedule.h``. - -:: - - inline Schedule create_schedule(Array ops) { - return Schedule(ops); - } - -``Schedule`` consists of collections of ``Stage`` and output ``Operation``. - -``Stage`` corresponds to one ``Operation``. In the vector add example above, there are two placeholder ops and one compute op, so the schedule ``s`` contains three stages. Each ``Stage`` holds information about a loop nest structure, types of each loop (``Parallel``, ``Vectorized``, ``Unrolled``), and where to execute its computation in the loop nest of the next ``Stage``, if any. - -``Schedule`` and ``Stage`` are defined in ``tvm/python/te/schedule.py``, ``include/tvm/te/schedule.h``, and ``src/te/schedule/schedule_ops.cc``. - -To keep it simple, we call ``tvm.build(...)`` on the default schedule created by ``create_schedule()`` function above, and we must add necessary thread bindings to make it runnable on GPU. 
- -:: - - target = "cuda" - bx, tx = s[C].split(C.op.axis[0], factor=64) - s[C].bind(bx, tvm.te.thread_axis("blockIdx.x")) - s[C].bind(tx, tvm.te.thread_axis("threadIdx.x")) - fadd = tvm.build(s, [A, B, C], target) - -``tvm.build()``, defined in ``python/tvm/driver/build_module.py``, takes a schedule, input and output ``Tensor``, and a target, and returns a :py:class:`tvm.runtime.Module` object. A :py:class:`tvm.runtime.Module` object contains a compiled function which can be invoked with function call syntax. - -The process of ``tvm.build()`` can be divided into two steps: - -- Lowering, where a high level, initial loop nest structures are transformed into a final, low level IR -- Code generation, where target machine code is generated from the low level IR - -Lowering is done by ``tvm.lower()`` function, defined in ``python/tvm/build_module.py``. First, bound inference is performed, and an initial loop nest structure is created. - -:: - - def lower(sch, - args, - name="default_function", - binds=None, - simple_mode=False): - ... - bounds = schedule.InferBound(sch) - stmt = schedule.ScheduleOps(sch, bounds) - ... - -Bound inference is the process where all loop bounds and sizes of intermediate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/te/schedule/bound.cc``, ``src/te/schedule/graph.cc`` and ``src/te/schedule/message_passing.cc``. - - -``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects those changes. ``ScheduleOps()`` is defined in ``src/te/schedule/schedule_ops.cc``. - -Next, we apply a number of lowering passes to ``stmt``. These passes are implemented in ``src/tir/pass`` subdirectory. For example, if you have applied ``vectorize`` or ``unroll`` primitives to your schedule, they are applied in loop vectorization and unrolling passes below. - -:: - - ... - stmt = ir_pass.VectorizeLoop(stmt) - ... - stmt = ir_pass.UnrollLoop( - stmt, - cfg.auto_unroll_max_step, - cfg.auto_unroll_max_depth, - cfg.auto_unroll_max_extent, - cfg.unroll_explicit) - ... - -After lowering is done, ``build()`` function generates target machine code from the lowered function. This code can contain SSE or AVX instructions if you target x86, or PTX instructions for CUDA target. In addition to target specific machine code, TVM also generates host side code that is responsible for memory management, kernel launch etc. - -Code generation is done by ``build_module()`` function, defined in ``python/tvm/target/codegen.py``. On the C++ side, code generation is implemented in ``src/target/codegen`` subdirectory. ``build_module()`` Python function will reach ``Build()`` function below in ``src/target/codegen/codegen.cc``: - - - -The ``Build()`` function looks up the code generator for the given target in the ``PackedFunc`` registry, and invokes the function found. For example, ``codegen.build_cuda`` function is registered in ``src/codegen/build_cuda_on.cc``, like this: - -:: - - TVM_REGISTER_GLOBAL("codegen.build_cuda") - .set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = BuildCUDA(args[0]); - }); - -The ``BuildCUDA()`` above generates CUDA kernel source from the lowered IR using ``CodeGenCUDA`` class defined in ``src/codegen/codegen_cuda.cc``, and compile the kernel using NVRTC. 
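To make the dispatch step concrete, here is a simplified sketch of the lookup that ``Build()`` performs; the exact registry key composition and argument types differ across TVM versions, so the helper below is illustrative rather than the actual implementation.

::

    // Illustrative sketch: compose the registry key from the target name and
    // forward to whichever code generator registered itself under that key.
    tvm::runtime::Module BuildForTarget(const tvm::IRModule& mod, const std::string& target_name) {
      std::string build_f_name = "codegen.build_" + target_name;  // e.g. "codegen.build_cuda"
      const tvm::runtime::PackedFunc* bf = tvm::runtime::Registry::Get(build_f_name);
      ICHECK(bf != nullptr) << build_f_name << " is not enabled in this build";
      return (*bf)(mod);
    }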
If you target a backend that uses LLVM, which includes x86, ARM, NVPTX and AMDGPU, code generation is done primarily by ``CodeGenLLVM`` class defined in ``src/codegen/llvm/codegen_llvm.cc``. ``CodeGenLLVM`` translates TVM IR into LLVM IR, runs a number of LLVM optimization passes, and generates target machine code. - -The ``Build()`` function in ``src/codegen/codegen.cc`` returns a ``runtime::Module`` object, defined in ``include/tvm/runtime/module.h`` and ``src/runtime/module.cc``. A ``Module`` object is a container for the underlying target specific ``ModuleNode`` object. Each backend implements a subclass of ``ModuleNode`` to add target specific runtime API calls. For example, the CUDA backend implements ``CUDAModuleNode`` class in ``src/runtime/cuda/cuda_module.cc``, which manages the CUDA driver API. The ``BuildCUDA()`` function above wraps ``CUDAModuleNode`` with ``runtime::Module`` and return it to the Python side. The LLVM backend implements ``LLVMModuleNode`` in ``src/codegen/llvm/llvm_module.cc``, which handles JIT execution of compiled code. Other subclasses of ``ModuleNode`` can be found under subdirectories of ``src/runtime`` corresponding to each backend. - -The returned module, which can be thought of as a combination of a compiled function and a device API, can be invoked on TVM's NDArray objects. - -:: - - dev = tvm.device(target, 0) - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - fadd(a, b, c) - output = c.numpy() - -Under the hood, TVM allocates device memory and manages memory transfers automatically. To do that, each backend needs to subclass ``DeviceAPI`` class, defined in ``include/tvm/runtime/device_api.h``, and override memory management methods to use device specific API. For example, the CUDA backend implements ``CUDADeviceAPI`` in ``src/runtime/cuda/cuda_device_api.cc`` to use ``cudaMalloc``, ``cudaMemcpy`` etc. - -The first time you invoke the compiled module with ``fadd(a, b, c)``, ``GetFunction()`` method of ``ModuleNode`` is called to get a ``PackedFunc`` that can be used for a kernel call. For example, in ``src/runtime/cuda/cuda_module.cc`` the CUDA backend implements ``CUDAModuleNode::GetFunction()`` like this: - -:: - - PackedFunc CUDAModuleNode::GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) { - auto it = fmap_.find(name); - const FunctionInfo& info = it->second; - CUDAWrappedFunc f; - f.Init(this, sptr_to_self, name, info.arg_types.size(), info.launch_param_tags); - return PackFuncVoidAddr(f, info.arg_types); - } - -The ``PackedFunc``'s overloaded ``operator()`` will be called, which in turn calls ``operator()`` of ``CUDAWrappedFunc`` in ``src/runtime/cuda/cuda_module.cc``, where finally we see the ``cuLaunchKernel`` driver call: - -:: - - class CUDAWrappedFunc { - public: - void Init(...) - ... 
- void operator()(TVMArgs args, - TVMRetValue* rv, - void** void_args) const { - int device_id; - CUDA_CALL(cudaGetDevice(&device_id)); - if (fcache_[device_id] == nullptr) { - fcache_[device_id] = m_->GetFunc(device_id, func_name_); - } - CUstream strm = static_cast(CUDAThreadEntry::ThreadLocal()->stream); - ThreadWorkLoad wl = launch_param_config_.Extract(args); - CUresult result = cuLaunchKernel( - fcache_[device_id], - wl.grid_dim(0), - wl.grid_dim(1), - wl.grid_dim(2), - wl.block_dim(0), - wl.block_dim(1), - wl.block_dim(2), - 0, strm, void_args, 0); - } - }; - -This concludes an overview of how TVM compiles and executes a function. Although we did not detail TOPI or Relay, in the end, all neural network operators go through the same compilation process as above. You are encouraged to dive into the details of the rest of the codebase. diff --git a/docs/dev/tutorial/index.rst b/docs/dev/tutorial/index.rst deleted file mode 100644 index 77fbe0f5405e..000000000000 --- a/docs/dev/tutorial/index.rst +++ /dev/null @@ -1,29 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _dev-tutorial: - -Developer Tutorial -================== - -This section is a guide to the TVM codebase, and an introduction on how to -contribute to different parts of the platform. - -.. toctree:: - :maxdepth: 1 - - codebase_walkthrough diff --git a/docs/how_to/deploy/adreno.rst b/docs/how_to/deploy/adreno.rst deleted file mode 100644 index f0b8c6f757cb..000000000000 --- a/docs/how_to/deploy/adreno.rst +++ /dev/null @@ -1,676 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Deploy to Adreno™ GPU -===================== - -**Authors**: Daniil Barinov, Egor Churaev, Andrey Malyshev, Siva Rama Krishna - -Introduction ------------- - -Adreno™ is a series of graphics processing unit (GPU) semiconductor -intellectual property cores developed by Qualcomm and used in many of -their SoCs. - -The Adreno™ GPU accelerates the rendering of complex geometries to -deliver high-performance graphics and a rich user experience with low -power consumption. 
TVM supports deep learning acceleration on Adreno™ GPUs through its native OpenCL backend and
also through the OpenCLML backend. The native OpenCL backend of TVM is enhanced to make it
Adreno™ friendly by incorporating texture memory usage and Adreno™ friendly layouts.
OpenCLML is an SDK released by Qualcomm that provides a kernel acceleration library
for most of the deep learning operators.

This guide is organized to demonstrate various design aspects of

- :ref:`OpenCL Backend Enhancements`
- :ref:`About OpenCLML`
- :ref:`Build and Deploy`


.. _opencl_enhancements:

OpenCL Backend Enhancements
---------------------------

The OpenCL backend of TVM is enhanced to take advantage of Adreno™ specific features such as

- Texture memory usage.
- Adreno™ friendly activation layouts.
- Brand new schedules to accelerate with the above features.

One of Adreno™'s advantages is its clever handling of textures. At
the moment, TVM is able to benefit from this by having texture support
for Adreno™. The figure below shows the Adreno™ A5x architecture.

|High-level overview of the Adreno™ A5x architecture for OpenCL|

*Fig. 1 High-level overview of the Adreno™ A5x architecture for OpenCL*

*source:* `OpenCL Optimization and Best Practices for Qualcomm Adreno™ GPUs `_

Reasons for using textures:

- The texture processor (TP) has a dedicated L1 cache, which is a read-only cache that stores data
  fetched from the level-2 (L2) cache for texture operations (primary reason).

- The handling of image boundaries is built-in.

- Numerous image format and data type combinations are supported, with
  support for automatic format conversions.

Overall, with textures, it is possible to achieve a significant performance boost
compared to OpenCL buffer based solutions.

In general we specify the target as ``target="opencl"`` for a regular OpenCL based target, which generates kernels as shown below.

.. code:: c

   __kernel void tvmgen_default_fused_nn_conv2d_kernel0(__global float* restrict p0, __global double* restrict p1, __global float* restrict conv2d_nhwc) {
   // body..

The above OpenCL kernel definition has ``__global float*`` pointers, which are essentially OpenCL ``buffer`` objects.

When texture based enhancements are enabled by modifying the target definition to ``target="opencl -device=adreno"``, we can see the generated
kernels using texture backed OpenCL image objects as shown below.

.. code:: c

   __kernel void tvmgen_default_fused_nn_conv2d_kernel0(__write_only image2d_t pad_temp_global_texture, __read_only image2d_t p0) {
   // body..

*image2d_t* is a built-in OpenCL type that represents a two-dimensional image object and provides several additional functions.
When we use *image2d_t* we read *4 elements at one time*, which helps to utilize the hardware in a more efficient way.

Please refer to :ref:`Advanced Usage` for more details about generation and inspection of kernel sources.


.. _about_openclml:

About OpenCLML
--------------

OpenCLML is an SDK released by Qualcomm that provides accelerated deep learning operators.
These operators are exposed as the extension ``cl_qcom_ml_ops`` to the standard OpenCL specification.
Please refer to `Accelerate your models with our OpenCL ML SDK `_ for more details.

OpenCLML is integrated into TVM as a `BYOC `_ solution.
OpenCLML operators can use the same context and can be enqueued on the same command queue as used in native OpenCL.
We take advantage of this to avoid any context switching overhead when falling back to native OpenCL.


.. _build_deploy:

TVM for Adreno™
---------------

This section gives instructions about various ways of building and deploying a model
to an Adreno™ target. Adreno™ is a remote target which is connected to the host via an ADB connection.
Deploying the compiled model requires the use of some tools on the host as well as on the target.

TVM provides simplified, user friendly command line based tools as well as a
developer centric python API interface for various steps like auto tuning, building and deploying.


|Adreno deployment pipeline|

*Fig.2 Build and Deployment pipeline on Adreno devices*

The figure above demonstrates a generalized pipeline for the various stages listed below.

**Model import:**
At this stage we import a model from well known frameworks like Tensorflow, PyTorch, ONNX ...etc.
This stage converts the given model into TVM's relay module format. Alternatively, one can build a relay module manually
by using TVM's operator inventory. The TVM module generated here is a target independent representation of the graph.

**Auto Tuning:**
At this stage we tune the TVM generated kernels specific to a target. The auto tuning process requires
target device availability, and in the case of a remote target like Adreno™ on an Android device we use an RPC Setup for communication.
Later sections in this guide will detail the RPC Setup for Android devices. Auto tuning is not a necessary step for
compilation of a model, but it is necessary for achieving the best performance out of TVM generated kernels.

**Compilation:**
At this stage we compile the model for the specific target. Given that we auto tuned the module in the previous stage,
TVM compilation makes use of the tuning log for generating the best performing kernels. The TVM compilation process produces artifacts
containing the kernel shared lib, the graph definition in json format and a parameters binary file in TVM specific format.

**Deploy (or test run) on Target:**
At this stage we run the TVM compilation output on the target. Deployment is possible from a python
environment using the RPC Setup and also using TVM's native tool, which is a native binary cross compiled for Android.
At this stage we can run the compiled model on the Android target and unit test output correctness and performance aspects.

**Application Integration:**
This stage is all about integrating the TVM compiled model in applications. Here we discuss
interfacing the tvm runtime from Android (cpp native environment or from JNI) for setting input and getting output.

**Advanced Usage:**
This section covers advanced user interests like viewing generated source code, altering the precision of the module ...etc.


This tutorial covers all the above aspects as part of the below sections.

- :ref:`Development environment`
- :ref:`RPC Setup`
- :ref:`Commandline tools`
- :ref:`Python interface`
- :ref:`Application Integration`
- :ref:`Advanced Usage`

.. _development_environment:


Development Environment Setup : Automatic
-----------------------------------------
TVM ships a predefined docker container environment with all prerequisites to get started quickly.
You may also refer to :ref:`Manual Environment Setup` for more control over the dependencies.

For the docker setup the only prerequisite is docker tool availability on the host.

The below commands can build a docker image for adreno.

::

   ./docker/build.sh ci_adreno
   docker tag tvm.ci_adreno ci_adreno


Now we can build both host and target utils with the below command.
- -:: - - ./tests/scripts/ci.py adreno -i - -To build TVM with OpenCLML SDK we need export the OpenCLML SDK as shown below while building - -:: - - export ADRENO_OPENCL= - ./tests/scripts/ci.py adreno -i - -On successful compilation this leaves us into a docker shell. The build leaves two folders - -* build-adreno: The host side TVM compiler build. -* build-adreno-target : Contains the android target components - - * libtvm_runtime.so : TVM runtime library - * tvm_rpc : The rpc runtime environment tool - * rtvm : A native stand alone tool - -While using docker environment the android device is shared with host. Hence, it is required -to have adb version ``1.0.41`` on the host as the docker used the same version. - -We can check adb devices availability inside docker environment too. - -:: - - user@ci-adreno-fpeqs:~$ adb devices - List of devices attached - aaaabbbb device - ccccdddd device - -.. _manual_setup: - -Development Environment Setup : Manual --------------------------------------- - -Manual build process require building of host and target components. - -Below command will configure the build the host compiler - -:: - - mkdir -p build - cd build - cp ../cmake/config.cmake . - - # Enable RPC capability to communicate to remote device. - echo set\(USE_RPC ON\) >> config.cmake - # We use graph executor for any host(x86) side verification of the model. - echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake - # Enable backtrace if possible for more ebug information on any crash. - echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake - # The target_host will be llvm. - echo set\(USE_LLVM ON\) >> config.cmake - -Additionally we can push below config entry to compile with OpenCLML support. - -:: - - export ADRENO_OPENCL= - echo set\(USE_CLML ${ADRENO_OPENCL}\) >> config.cmake - -now we can build as shown below - -:: - - cmake .. - make - -Finally we can export python path as - -:: - - export PYTHONPATH=$TVM_HOME/python:${PYTHONPATH} - python3 -c "import tvm" # Verify tvm python package - - -Now, we can configure and build the target components with below configuration -Target build require Android NDK to be installed. - -- Read documentation about *Android NDK installation* here: https://developer.android.com/ndk -- To get access to adb tools you can see *Android Debug Bridge installation* here: https://developer.android.com/studio/command-line/adb - - -:: - - mkdir -p build-adreno - cd build-adreno - cp ../cmake/config.cmake . - # Enable OpenCL backend. - echo set\(USE_OPENCL ON\) >> config.cmake - # Enable RPC functionality. - echo set\(USE_RPC ON\) >> config.cmake - # Build tvm_rpc tool that runs on target device. - echo set\(USE_CPP_RPC ON\) >> config.cmake - # Build native rtvm deploy tool. - echo set\(USE_CPP_RTVM ON\) >> config.cmake - # We use graph executor for deploying on devices like Android. - echo set\(USE_GRAPH_EXECUTOR ON\) >> config.cmake - # Backtrace enablement if possible. - echo set\(USE_LIBBACKTRACE AUTO\) >> config.cmake - # Adreno supports 32bit alignment for OpenCL allocations rather 64bit. - echo set\(USE_KALLOC_ALIGNMENT 32\) >> config.cmake - - # Android build related defines. - echo set\(ANDROID_ABI arm64-v8a\) >> config.cmake - echo set\(ANDROID_PLATFORM android-28\) >> config.cmake - echo set\(MACHINE_NAME aarch64-linux-gnu\) >> config.cmake - -Additionally we can push below config to compile with OpenCLML support. 
- -:: - - export ADRENO_OPENCL= - echo set\(USE_CLML "${ADRENO_OPENCL}"\) >> config.cmake - echo set\(USE_CLML_GRAPH_EXECUTOR "${ADRENO_OPENCL}"\) >> config.cmake - -For Android target build ``ANDROID_NDK_HOME`` is a dependency and we should have the same in the enviromnet variable. -Below commands will build Adreno™ target components - -:: - - cmake -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK_HOME}/build/cmake/android.toolchain.cmake" \ - -DANDROID_ABI=arm64-v8a \ - -DANDROID_PLATFORM=android-28 \ - -DCMAKE_SYSTEM_VERSION=1 \ - -DCMAKE_FIND_ROOT_PATH="${ADRENO_OPENCL}" \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DCMAKE_CXX_COMPILER="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang++" \ - -DCMAKE_C_COMPILER="${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang" \ - -DMACHINE_NAME="aarch64-linux-gnu" .. - - make tvm_runtime tvm_rpc rtvm - - -.. _rpc_setup: - -RPC Setup ---------- - -RPC Setup allows remote target access over TCP/IP networking interface. RPC Setup is essential for auto tuning stage as tuning -involves running of auto generated kernels on real device and optimize the same by using machine learning approach. Please refer -`Auto-Tune with Templates and AutoTVM `_ got more details about AutoTVM. - -RPC Setup is also useful to deply the compiled model to a remote device from python interface or ``tvmc`` tool from host device. - -RPC Setup has multiple components as listed below. - -**TVM Tracker:** -TVM tracker is a host side daemon that manages remote devices and serve them to host side applications. Applications -can connect to this tracker and acquire a remote device handle to communicate. - -**TVM RPC:** -TVM RPC is a native application that runs on the remote device (Android in our case) and registers itself to the TVM Tracker -running on the host. - - -Hence, for RPC based setup we will have above components running on host and target device. Below sections explain how to setup the same -manually and also inside docker using automated tools. - -**Automated RPC Setup:** -Here we will explain how to setup RPC in docker environment. - -Below command launches tracker in docker environment, where tracker listens on port 9190. - -:: - - ./tests/scripts/ci.py adreno -i # Launch a new shell on the anreno docker - source tests/scripts/setup-adreno-env.sh -e tracker -p 9190 - -Now, the below comand can run TVM RPC on remote android device with id ``abcdefgh``. - - -:: - - ./tests/scripts/ci.py adreno -i # Launch a new shell on adreno docker. - source tests/scripts/setup-adreno-env.sh -e device -p 9190 -d abcdefgh - -Further, below command can be used to query the RPC setup details on any other docker terminals. - -:: - - ./tests/scripts/ci.py adreno -i # Launch a new shell on adreno docker. - source tests/scripts/setup-adreno-env.sh -e query -p 9190 - - -**Manual RPC Setup:** - -Please refer to the tutorial -`How To Deploy model on Adreno `_ -for manual RPC environment setup. - -This concludes RPC Setup and we have rpc-tracker available on host ``127.0.0.1`` (rpc-tracker) and port ``9190`` (rpc-port). - - -.. _commandline_interface: - -Commandline Tools ------------------ - -Here we describe entire compilation process using command line tools. TVM has command line utility -`tvmc `_ to perform -model import, auto tuning, compilation and deply over rpc. -`tvmc `_ has many options to explore and try. 
**Model Import & Tuning:**
Use the below command to import a model from any framework and auto tune it.
Here we use a model from Keras; it uses the RPC setup for tuning and finally generates the tuning log file
``keras-resnet50.log``.

::

   python3 -m tvm.driver.tvmc tune --target="opencl -device=adreno" \
   --target-host="llvm -mtriple=aarch64-linux-gnu" \
   resnet50.h5 -o \
   keras-resnet50.log \
   --early-stopping 0 --repeat 30 --rpc-key android \
   --rpc-tracker 127.0.0.1:9190 --trials 1024 \
   --tuning-records keras-resnet50-records.log --tuner xgb

**Model Compilation:**

Use the below command to compile the model and produce the TVM compiler outputs.

::

   python3 -m tvm.driver.tvmc compile \
   --cross-compiler ${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang \
   --target="opencl, llvm" --target-llvm-mtriple aarch64-linux-gnu --target-opencl-device adreno \
   --tuning-records keras-resnet50.log -o keras-resnet50.tar resnet50.h5

While enabling OpenCLML offloading we need to add the target ``clml`` as shown below. The tuning log is valid for OpenCLML offloading too,
as the OpenCL path is the fallback option for any operator that did not go through the OpenCLML path. The tuning log will be used for such operators.

::

   python3 -m tvm.driver.tvmc compile \
   --cross-compiler ${ANDROID_NDK_HOME}/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang \
   --target="opencl, clml, llvm" --desired-layout NCHW --target-llvm-mtriple aarch64-linux-gnu --target-opencl-device adreno \
   --tuning-records keras-resnet50.log -o keras-resnet50.tar resnet50.h5

On successful compilation, the above command produces ``keras-resnet50.tar``.
It is a compressed archive with the kernel shared lib (mod.so), graph json (mod.json) and params binary (mod.params).

**Deploy & Run on Target:**

Running the compiled model on an Android target is possible over RPC as well as through native deployment.

We can use the below tvmc command to deploy on a remote target via the RPC based setup.

::

   python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar \
   --rpc-key android --rpc-tracker 127.0.0.1:9190 --print-time

A `tvmc `_ based run has more options
to initialize the input in various modes like fill, random, etc.

A ``tvmc`` based deployment is generally a quick verification of the compiled model on the target from a remote host via the RPC setup.

Production generally uses a native deployment environment like Android JNI or CPP native environments.
Here we need to use the cross compiled ``tvm_runtime`` interface to deploy the tvm compilation output, i.e. the ``TVMPackage``.

TVM has a standalone tool ``rtvm`` to deploy and run the model natively over an ADB shell. The build process produces this tool under build-adreno-target.
Please refer to `rtvm `_ for more details about this tool.

While integrating into an existing Android application, TVM has multiple options. For JNI or CPP native integration we may use the `C Runtime API `_.
You may also refer to ``rtvm``'s simplified interface `TVMRunner `_.

.. _python_interface:

Python Interface
----------------

This section explains importing, auto tuning, compiling and running a model using the python interface.
TVM has a high level interface through the ``tvmc`` abstraction as well as a low level relay api. We will discuss both of these in detail.

**TVMC Interface:**

While using the ``tvmc`` python interface we first load a model, which produces a ``TVMCModel``. The ``TVMCModel`` is used for auto tuning to produce a tuning cache.
The compilation process uses the ``TVMCModel`` and the tuning cache (optional) to produce a ``TVMCPackage``. The ``TVMCPackage`` can then be saved to the file system or
used to deploy and run on the target device, as sketched below.
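The sketch below runs that flow end to end with the same Keras model and RPC setup used in the command line examples. The keyword arguments follow the
``tvmc`` python API at the time of writing and may need small adjustments for your TVM version; the cross compiler is assumed to be on ``PATH``.

.. code:: python

   from tvm.driver import tvmc

   # Load the model; this produces a TVMCModel.
   model = tvmc.load("resnet50.h5")

   # Auto tune over the RPC setup from the previous section and record the tuning log.
   tvmc.tune(
       model,
       target="opencl -device=adreno",
       target_host="llvm -mtriple=aarch64-linux-gnu",
       hostname="127.0.0.1",
       port=9190,
       rpc_key="android",
       tuning_records="keras-resnet50.log",
   )

   # Compile with the tuning records; this produces a TVMCPackage (keras-resnet50.tar).
   package = tvmc.compile(
       model,
       target="opencl -device=adreno",
       target_host="llvm -mtriple=aarch64-linux-gnu",
       cross="aarch64-linux-android28-clang",
       tuning_records="keras-resnet50.log",
       package_path="keras-resnet50.tar",
   )

   # Deploy and run on the remote device through the RPC tracker.
   result = tvmc.run(
       package,
       device="cl",
       hostname="127.0.0.1",
       port=9190,
       rpc_key="android",
   )
   print(result)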
Please refer to the tutorial
`How To Deploy model on Adreno using TVMC `_ for a complete walk through of this flow.

A saved ``TVMCPackage`` can be used for native deployment using the ``rtvm`` utility too.

Also, please refer to the `tvmc `_
documentation for more details about the api interface.

**Relay Interface:**

The relay api gives lower level access to the tvm compiler interface.
Similar to the ``tvmc`` interface, the relay api provides various frontend APIs to convert models to a relay ``Module``.
The relay ``Module`` is used for all kinds of transforms like precision conversions, CLML offloading and other custom transforms, if any.
The resulting Module is used for auto tuning too. Finally, we use the ``relay.build`` API to generate the library module.
From this library module, we can export compilation artifacts like the module shared library (mod.so), params (mod.params) and json graph (mod.json).
This library module is used to create the graph runtime to deploy and run on the target device.

Please refer to the tutorial `How To Deploy model on Adreno `_
for a step by step explanation of the same.

Additionally, TVM also supports a Java interface through `TVM4J `_.

.. _application_integration:

Application Integration
-----------------------

TVM compilation output is represented as a module shared lib (mod.so), graph json (mod.json) and params (mod.params).
The archived representation of a TVMPackage also contains the same.

In general, a CPP/C based interface will be sufficient for any Android application integration.

TVM natively exposes the ``c_runtime_api`` for loading a TVM compiled module and running it.

Alternatively, one may refer to the `cpp_rtvm `_
``TVMRunner`` interface too for a further simplified version of the same.

.. _advanced_usage:

Advanced Usage
--------------

This section details some advanced usage and additional information while using the Adreno™ target on TVM.

Generated Source Inspection
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Apart from the standard tvm compilation artifacts, kernel library (mod.so), graph (mod.json) and params (mod.params),
we can also generate the opencl kernel source, the clml offloaded graph, etc. from the lib handle as shown below.
The TVM compilation output is organized as a TVM module with many other TVM modules imported into it.

The below snippet dumps the CLML sub graphs in json format.

.. code:: python

   # Look for "clml" typed module imported.
   clml_modules = list(filter(lambda mod: mod.type_key == "clml", lib.get_lib().imported_modules))
   # Loop through all clml sub graphs and dump the json formatted CLML sub graphs.
   for cmod in clml_modules:
       print("CLML Src:", cmod.get_source())


Similarly, the below snippet extracts the opencl kernel source from the compiled TVM module.

.. code:: python

   # Similarly we can dump the OpenCL kernel source too, as shown below.
   # Look for "opencl" typed module imported.
   opencl_modules = list(filter(lambda mod: mod.type_key == "opencl", lib.get_lib().imported_modules))
   # Now dump the kernel source for each OpenCL targeted sub graph.
   for omod in opencl_modules:
       print("OpenCL Src:", omod.get_source())
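The ``lib`` handle used in the snippets above is the output of a regular relay build. Below is a minimal sketch of how it may be obtained, assuming the same
Keras ``resnet50.h5`` model as in the earlier examples and ``TVM_NDK_CC`` pointing to the NDK clang for the export step.

.. code:: python

   import tvm
   from tvm import relay
   from tvm.contrib import ndk
   from tvm.driver import tvmc

   # Import the model through any frontend; tvmc.load is used here for brevity and
   # the relay module and params are taken from the resulting TVMCModel.
   tvmc_model = tvmc.load("resnet50.h5")
   mod, params = tvmc_model.mod, tvmc_model.params

   target = "opencl -device=adreno"
   target_host = "llvm -mtriple=aarch64-linux-gnu"

   with tvm.transform.PassContext(opt_level=3):
       lib = relay.build(mod, target=target, target_host=target_host, params=params)

   # Export the shared library for the Android target. ndk.create_shared reads the
   # TVM_NDK_CC environment variable to find the NDK clang compiler.
   lib.export_library("mod.so", fcompile=ndk.create_shared)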
Precisions
~~~~~~~~~~
The right choice of precision for a specific workload can greatly increase the efficiency of the solution,
shifting the initial balance of precision and speed to the side that is a priority for the problem.

We can choose from *float16*, *float16_acc32* (Mixed Precision) and *float32* (standard).

**Float16**

To leverage the GPU hardware capabilities and utilize the benefits of half precision computation and memory management,
we can convert an original model having floating point operations to a model operating with half precision.
Choosing a lower precision will positively affect the performance of the model, but it may also decrease the accuracy of the model.

To do the conversion you need to call the adreno specific transformation API as soon as the relay module is generated through any frontend.

.. code:: python

    from tvm.driver.tvmc.transform import apply_graph_transforms
    mod = apply_graph_transforms(
        mod,
        {
            "mixed_precision": True,
            "mixed_precision_ops": ["nn.conv2d", "nn.dense"],
            "mixed_precision_calculation_type": "float16",
            "mixed_precision_acc_type": "float16",
        },
    )


``tvm.driver.tvmc.transform.apply_graph_transforms`` is a simplified API over the ``ToMixedPrecision`` pass to get the desired precision.

We can then compile our model in any convenient way

.. code:: python

   with tvm.transform.PassContext(opt_level=3):
       lib = relay.build(
           mod, target_host=target_host, target=target, params=params
       )

While using the ``tvmc`` python interface, the below arguments enable the precision conversion to float16.

.. code:: python

   mixed_precision = True,
   mixed_precision_ops = ["nn.conv2d", "nn.dense"],
   mixed_precision_calculation_type = "float16",
   mixed_precision_acc_type = "float16"

Similarly, the ``tvmc`` command line interface has the below listed options.

.. code:: bash

   --mixed-precision
   --mixed-precision-ops nn.conv2d nn.dense
   --mixed-precision-calculation-type float16
   --mixed-precision-acc-type float16


**float16_acc32 (Mixed Precision)**

The ``ToMixedPrecision`` pass traverses over the network and splits the network into clusters of ops dealing with float or float16 data types.
The clusters are defined by three types of operations:

- Operations that are always converted to the float16 data type
- Operations that can be converted if they follow a converted cluster
- Operations that are never converted to the float16 data type

This list is defined in the ToMixedPrecision implementation here
`relay/transform/mixed_precision.py `_
and can be overridden by the user.

The ``ToMixedPrecision`` method is a pass to convert an FP32 relay graph into an FP16 version (with
FP16 or FP32 accumulation dtypes). Doing this transformation is useful for reducing model size
as it halves the expected size of the weights (FP16_acc16 case).

The ``ToMixedPrecision`` pass usage is simplified into a single call as shown below.

.. code:: python

    from tvm.driver.tvmc.transform import apply_graph_transforms
    mod = apply_graph_transforms(
        mod,
        {
            "mixed_precision": True,
            "mixed_precision_ops": ["nn.conv2d", "nn.dense"],
            "mixed_precision_calculation_type": "float16",
            "mixed_precision_acc_type": "float32",
        },
    )


``tvm.driver.tvmc.transform.apply_graph_transforms`` is a simplified API over the ``ToMixedPrecision`` pass to get the desired precision.

We can then compile our model in any convenient way

..
code:: python - - with tvm.transform.PassContext(opt_level=3): - lib = relay.build( - mod, target_host=target_host, target=target, params=params - ) - -While using ``tvmc`` python interface, the below arguments enables precision conversion to float16. - -.. code:: python - - mixed_precision = True, - mixed_precision_ops = ["nn.conv2d", "nn.dense"], - mixed_precision_calculation_type = "float16", - mixed_precision_acc_type = "float32" - -Similarly, ``tvmc`` command line interface option bas below listed options. - -.. code:: bash - - --mixed-precision - --mixed-precision-ops nn.conv2d nn.dense - --mixed-precision-calculation-type float16 - --mixed-precision-acc-type float32 - - -.. |High-level overview of the Adreno™ A5x architecture for OpenCL| image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/how-to/adreno_architecture.png -.. |Adreno deployment pipeline| image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/how-to/Adreno-Deployment-Pipeline.jpg diff --git a/docs/how_to/deploy/android.rst b/docs/how_to/deploy/android.rst deleted file mode 100644 index 2f5469740192..000000000000 --- a/docs/how_to/deploy/android.rst +++ /dev/null @@ -1,42 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Deploy to Android -================= - -Build model for Android Target ------------------------------- - -Relay compilation of model for android target could follow same approach like android_rpc. -The code below will save the compilation output which is required on android target. - - -.. code:: python - - lib.export_library("deploy_lib.so", fcompile=ndk.create_shared) - with open("deploy_graph.json", "w") as fo: - fo.write(graph.json()) - with open("deploy_param.params", "wb") as fo: - fo.write(runtime.save_param_dict(params)) - -deploy_lib.so, deploy_graph.json, deploy_param.params will go to android target. - -TVM Runtime for Android Target ------------------------------- - -Refer `here `_ to build CPU/OpenCL version flavor TVM runtime for android target. -From android java TVM API to load model & execute can be referred at this `java `_ sample source. diff --git a/docs/how_to/deploy/arm_compute_lib.rst b/docs/how_to/deploy/arm_compute_lib.rst deleted file mode 100644 index 31b815d05d19..000000000000 --- a/docs/how_to/deploy/arm_compute_lib.rst +++ /dev/null @@ -1,264 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. 
http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Relay Arm\ :sup:`®` Compute Library Integration -=============================================== -**Author**: `Luke Hutton `_ - -Introduction ------------- - -Arm Compute Library (ACL) is an open source project that provides accelerated kernels for Arm CPU's -and GPU's. Currently the integration offloads operators to ACL to use hand-crafted assembler -routines in the library. By offloading select operators from a relay graph to ACL we can achieve -a performance boost on such devices. - -Installing Arm Compute Library ------------------------------- - -Before installing Arm Compute Library, it is important to know what architecture to build for. One way -to determine this is to use `lscpu` and look for the "Model name" of the CPU. You can then use this to -determine the architecture by looking online. - -TVM only supports a single version of ACL, currently this is v21.08, there are two recommended ways to build and install -the required libraries: - -* Use the script located at `docker/install/ubuntu_download_arm_compute_lib_binaries.sh`. You can use this - script for downloading ACL binaries for the architecture and extensions specified in `target_lib`, these - will be installed to the location denoted by `install_path`. -* Alternatively, you can download the pre-built binaries from: - https://github.com/ARM-software/ComputeLibrary/releases. When using this package, you will need to - select the binaries for the architecture and extensions you require, then make sure they are visible - to CMake: - - .. code:: bash - - cd /lib - mv .//* . - - -In both cases you will need to set USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR to the path where the ACL package -is located. CMake will look in /path-to-acl/ along with /path-to-acl/lib and /path-to-acl/build for the -required binaries. See the section below for more information on how to use these configuration options. - -Building with ACL support -------------------------- - -The current implementation has two separate build options in CMake. The reason for this split is -because ACL cannot be used on an x86 machine. However, we still want to be able compile an ACL -runtime module on an x86 machine. - -* USE_ARM_COMPUTE_LIB=ON/OFF - Enabling this flag will add support for compiling an ACL runtime module. -* USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON/OFF/path-to-acl - Enabling this flag will allow the graph executor to - compute the ACL offloaded functions. - -These flags can be used in different scenarios depending on your setup. For example, if you want -to compile an ACL module on an x86 machine and then run the module on a remote Arm device via RPC, you will -need to use USE_ARM_COMPUTE_LIB=ON on the x86 machine and USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON on the remote -AArch64 device. - -By default both options are set to OFF. Using USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON will mean that ACL -binaries are searched for by CMake in the default locations -(see https://cmake.org/cmake/help/v3.4/command/find_library.html). In addition to this, -/path-to-tvm-project/acl/ will also be searched. It is likely that you will need to set your own path to -locate ACL. 
This can be done by specifying a path in the place of ON. - -These flags should be set in your config.cmake file. For example: - -.. code:: cmake - - set(USE_ARM_COMPUTE_LIB ON) - set(USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR /path/to/acl) - - -Usage ------ - -.. note:: - - This section may not stay up-to-date with changes to the API. - -Create a relay graph. This may be a single operator or a whole graph. The intention is that any -relay graph can be input. The ACL integration will only pick supported operators to be offloaded -whilst the rest will be computed via TVM. (For this example we will use a single -max_pool2d operator). - -.. code:: python - - import tvm - from tvm import relay - - data_type = "float32" - data_shape = (1, 14, 14, 512) - strides = (2, 2) - padding = (0, 0, 0, 0) - pool_size = (2, 2) - layout = "NHWC" - output_shape = (1, 7, 7, 512) - - data = relay.var('data', shape=data_shape, dtype=data_type) - out = relay.nn.max_pool2d(data, pool_size=pool_size, strides=strides, layout=layout, padding=padding) - module = tvm.IRModule.from_expr(out) - - -Annotate and partition the graph for ACL. - -.. code:: python - - from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib - module = partition_for_arm_compute_lib(module) - - -Build the Relay graph. - -.. code:: python - - target = "llvm -mtriple=aarch64-linux-gnu -mattr=+neon" - with tvm.transform.PassContext(opt_level=3, disabled_pass=["AlterOpLayout"]): - lib = relay.build(module, target=target) - - -Export the module. - -.. code:: python - - lib_path = '~/lib_acl.so' - cross_compile = 'aarch64-linux-gnu-c++' - lib.export_library(lib_path, cc=cross_compile) - - -Run Inference. This must be on an Arm device. If compiling on x86 device and -running on AArch64, consider using the RPC mechanism. :ref:`Tutorials for using -the RPC mechanism ` - -.. code:: python - - dev = tvm.cpu(0) - loaded_lib = tvm.runtime.load_module('lib_acl.so') - gen_module = tvm.contrib.graph_executor.GraphModule(loaded_lib['default'](dev)) - d_data = np.random.uniform(0, 1, data_shape).astype(data_type) - map_inputs = {'data': d_data} - gen_module.set_input(**map_inputs) - gen_module.run() - - -More examples -------------- -The example above only shows a basic example of how ACL can be used for offloading a single -Maxpool2D. If you would like to see more examples for each implemented operator and for -networks refer to the tests: `tests/python/contrib/test_arm_compute_lib`. Here you can modify -`test_config.json` to configure how a remote device is created in `infrastructure.py` and, -as a result, how runtime tests will be run. - -An example configuration for `test_config.json`: - -* connection_type - The type of RPC connection. Options: local, tracker, remote. -* host - The host device to connect to. -* port - The port to use when connecting. -* target - The target to use for compilation. -* device_key - The device key when connecting via a tracker. -* cross_compile - Path to cross compiler when connecting from a non-arm platform e.g. aarch64-linux-gnu-g++. - -.. 
code:: json - - { - "connection_type": "local", - "host": "127.0.0.1", - "port": 9090, - "target": "llvm -mtriple=aarch64-linux-gnu -mattr=+neon", - "device_key": "", - "cross_compile": "" - } - - -Operator support ----------------- -+----------------------+-------------------------------------------------------------------------+ -| Relay Node | Remarks | -+======================+=========================================================================+ -| nn.conv2d | fp32: | -| | Simple: nn.conv2d | -| | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu? | -| | | -| | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1 | -| | or 2x2) convolution supported. Grouped convolution is not supported. | -+----------------------+-------------------------------------------------------------------------+ -| qnn.conv2d | uint8: | -| | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize | -| | | -| | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1 | -| | or 2x2) convolution supported. Grouped convolution is not supported. | -+----------------------+-------------------------------------------------------------------------+ -| nn.dense | fp32: | -| | Simple: nn.dense | -| | Composite: nn.dense, nn.bias_add? | -+----------------------+-------------------------------------------------------------------------+ -| qnn.dense | uint8: | -| | Composite: qnn.dense, nn.bias_add?, qnn.requantize | -+----------------------+-------------------------------------------------------------------------+ -| nn.max_pool2d | fp32, uint8 | -+----------------------+-------------------------------------------------------------------------+ -| nn.global_max_pool2d | fp32, uint8 | -+----------------------+-------------------------------------------------------------------------+ -| nn.avg_pool2d | fp32: | -| | Simple: nn.avg_pool2d | -| | | -| | uint8: | -| | Composite: cast(int32), nn.avg_pool2d, cast(uint8) | -+----------------------+-------------------------------------------------------------------------+ -| nn.global_avg_pool2d | fp32: | -| | Simple: nn.global_avg_pool2d | -| | | -| | uint8: | -| | Composite: cast(int32), nn.avg_pool2d, cast(uint8) | -+----------------------+-------------------------------------------------------------------------+ -| power(of 2) + | A special case for L2 pooling. | -| nn.avg_pool2d + | | -| sqrt | fp32: | -| | Composite: power(of 2), nn.avg_pool2d, sqrt | -+----------------------+-------------------------------------------------------------------------+ -| reshape | fp32, uint8 | -+----------------------+-------------------------------------------------------------------------+ -| maximum | fp32 | -+----------------------+-------------------------------------------------------------------------+ -| add | fp32 | -+----------------------+-------------------------------------------------------------------------+ -| qnn.add | uint8 | -+----------------------+-------------------------------------------------------------------------+ - -.. note:: - A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this - as being a single fused operator from the view point of Arm Compute Library. '?' denotes an optional operator in - the series of operators that make up a composite operator. - - -Adding a new operator ---------------------- -Adding a new operator requires changes to a series of places. 
This section will give a hint on -what needs to be changed and where, it will not however dive into the complexities for an -individual operator. This is left to the developer. - -There are a series of files we need to make changes to: - -* `python/relay/op/contrib/arm_compute_lib.py` In this file we define the operators we wish to offload using the - `op.register` decorator. This will mean the annotation pass recognizes this operator as ACL offloadable. -* `src/relay/backend/contrib/arm_compute_lib/codegen.cc` Implement `Create[OpName]JSONNode` method. This is where we - declare how the operator should be represented by JSON. This will be used to create the ACL module. -* `src/runtime/contrib/arm_compute_lib/acl_runtime.cc` Implement `Create[OpName]Layer` method. This is where we - define how the JSON representation can be used to create an ACL function. We simply define how to - translate from the JSON representation to ACL API. -* `tests/python/contrib/test_arm_compute_lib` Add unit tests for the given operator. diff --git a/docs/how_to/deploy/bnns.rst b/docs/how_to/deploy/bnns.rst deleted file mode 100644 index b2afa5001d26..000000000000 --- a/docs/how_to/deploy/bnns.rst +++ /dev/null @@ -1,184 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Relay BNNS Integration -====================== -**Author**: `Egor Churaev `_ - -Introduction ------------- - -Apple BNNS library is a collection of functions that can be used to construct neural networks -for inference (and train). It’s supported in macOS, iOS, tvOS, and watchOS. BNNS provides -primitives executed on all CPU supported on those platforms and optimized for high performance -and low-energy consumption. This integration will offload as many operators as possible from Relay to BNNS. - -BNNS runtime is a part of platform API and available on all modern Apple operating systems. -Application using BNNS will not depends on any additional external dependencies. - -BNNS functions uses Apple private hardware capabilities which are not exposed yet by Apple. Example -of such capabilities can be AMX Apple cpu extension. - -This guide will demonstrate how to build TVM with BNNS codegen and runtime enabled. It will also provide example -code to compile and run models using BNNS runtime. Finally, we document the supported operators. - -Building TVM with BNNS support ------------------------------- - -To turn on TVM BNNS codegen and TVM BNNS runtime you need to turn on the only USE_BNNS flag - -* USE_BNNS=ON/OFF - This flag will enable compiling a network with offloading subgraphs to BNNS primitives - and will link tvm library to the BNNS runtime module. - -Enabling of this flag will cause to search the default Accelerate Frameworks on current target SDK. 
-The minimal versions of required SDK is macOS 11.0, iOS 14.0, tvOS 14.0 and watchOS 7.0. - -Example setting in config.cmake file: - -.. code:: cmake - - set(USE_BNNS ON) - -BNNS partitioning of Relay graph --------------------------------- - -Operations to be offloaded on BNNS execution must be annotated before passing of module for compilation. -All ops annotated by `partition_for_bnns` will be offloaded for BNNS execution. The rest of the ops -will go through the LLVM compilation and code generation. - -Important note: BNNS support primitives only with constant weights. To satisfy this requirements we have -to map constants to related tensor abstraction in relay representation. To freeze tensors and operate -with them as constants you may need to call ONNX importer with special flag "freeze_params=True" -or performer binding manually. In general cases all relay importers don't do that by default. -For your convenience "partition_for_bnns" can do this for you if params dictionary is passed as the argument. - -.. code:: python - - from tvm.relay.op.contrib.bnns import partition_for_bnns - model = partition_for_bnns(model, params=params) - - -Input data layout for operations to be offloaded to BNNS execution ------------------------------------------------------------------- - -BNNS kernels support only planar format of input data. The partitioner will require to have NCHW input -layout for conv2d input. - -To use BNNS integration for models with interleave input layout, they should be converted before -passing of module to `partition_for_bnns`. The layout conversion will happen only for explicitly -enumerated types of ops. It might happen that depending on topology there might be regular data reorder -around conv2d to interleave and planar layout. This will be reflected in performance penalties and affect -execution time. It is recommended to analyze the whole topology and extend below list to convert all -intermediate tensors to NCHW data layout. - -Example of input layouts change: - -.. code:: python - - # For models with NHWC input layout - with tvm.transform.PassContext(opt_level=3): - mod = relay.transform.InferType()(mod) - mod = relay.transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"], - "nn.bias_add": ["NCHW", "default"], - "nn.relu": ["NCHW"]})(mod) - - -Example: Build and Deploy Mobilenet v2 1.0 with BNNS ----------------------------------------------------- - -Create a Relay graph from a MXNet Mobilenet v2 1.0 model. - -.. code:: python - - import tvm - from tvm import relay - import mxnet - from mxnet.gluon.model_zoo.vision import get_model - - dtype = "float32" - input_shape = (1, 3, 224, 224) - block = get_model('mobilenetv2_1.0', pretrained=True) - module, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype) - - -Markup the parts of graphs to be offloaded to BNNS primitives. All ops which are supported by the BNNS -integration will be handled by BNNS invocations, the rest of the ops will go through the -regular TVM llvm compilation and code generation. - -After that you need to compile new module with target corresponding to required Apple platform - -.. 
code:: python - - from tvm.relay.op.contrib.bnns import partition_for_bnns - - # target for macOS Big Sur 11.1: - target = "llvm -mtriple=x86_64-apple-darwin20.2.0" - - model = partition_for_bnns(model, params=params) # to markup operations to be offloaded to BNNS - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(model, target=target, params=params) - -Export the module. - -.. code:: python - - lib.export_library('compiled.dylib') - - -Load module and run inference on the target machine with TVM built with ``USE_BNNS`` enabled - -.. code:: python - - import tvm - import numpy as np - from tvm.contrib import graph_executor - - dev = tvm.cpu(0) - loaded_lib = tvm.runtime.load_module('compiled.dylib') - gen_module = tvm.contrib.graph_executor.GraphModule(loaded_lib['default'](dev)) - - dtype = "float32" - input_shape = (1, 3, 224, 224) - input_data = np.random.uniform(0, 1, input_shape).astype(dtype) - gen_module.run(data=input_data) - - - -Operator support ----------------- - -+------------------------+------------------------------------------------------------------------------+ -| Relay Node | Remarks | -+========================+==============================================================================+ -| nn.conv2d | | -+------------------------+------------------------------------------------------------------------------+ -| nn.batch_norm | Supported by BNNS integration only in nn.conv2d-batch_norm pattern | -+------------------------+------------------------------------------------------------------------------+ -| nn.dense | | -+------------------------+------------------------------------------------------------------------------+ -| nn.batch_matmul | | -+------------------------+------------------------------------------------------------------------------+ -| nn.bias_add | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense | -| | fusion | -+------------------------+------------------------------------------------------------------------------+ -| add | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense | -| | fusion | -+------------------------+------------------------------------------------------------------------------+ -| nn.relu | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion | -+------------------------+------------------------------------------------------------------------------+ -| nn.gelu | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion | -+------------------------+------------------------------------------------------------------------------+ diff --git a/docs/how_to/deploy/cpp_deploy.rst b/docs/how_to/deploy/cpp_deploy.rst deleted file mode 100644 index 44df1e55b58e..000000000000 --- a/docs/how_to/deploy/cpp_deploy.rst +++ /dev/null @@ -1,56 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. 
See the License for the - specific language governing permissions and limitations - under the License. - - -Deploy TVM Module using C++ API -=============================== - -We provide an example on how to deploy TVM modules in `apps/howto_deploy `_ - -To run the example, you can use the following command - - -.. code:: bash - - cd apps/howto_deploy - ./run_example.sh - - -Get TVM Runtime Library ------------------------ - -The only thing we need is to link to a TVM runtime in your target platform. -TVM provides a minimum runtime, which costs around 300K to 600K depending on how much modules we use. -In most cases, we can use ``libtvm_runtime.so`` that comes with the build. - -If somehow you find it is hard to build ``libtvm_runtime``, checkout -`tvm_runtime_pack.cc `_. -It is an example all in one file that gives you TVM runtime. -You can compile this file using your build system and include this into your project. - -You can also checkout `apps `_ for example applications build with TVM on iOS, Android and others. - -Dynamic Library vs. System Module ---------------------------------- -TVM provides two ways to use the compiled library. -You can checkout `prepare_test_libs.py `_ -on how to generate the library and `cpp_deploy.cc `_ on how to use them. - -- Store library as a shared library and dynamically load the library into your project. -- Bundle the compiled library into your project in system module mode. - -Dynamic loading is more flexible and can load new modules on the fly. System module is a more ``static`` approach. We can use system module in places where dynamic library loading is banned. diff --git a/docs/how_to/deploy/hls.rst b/docs/how_to/deploy/hls.rst deleted file mode 100644 index d172332ba85c..000000000000 --- a/docs/how_to/deploy/hls.rst +++ /dev/null @@ -1,182 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - - -HLS Backend Example -=================== - -TVM supports Xilinx FPGA board with SDAccel. Here is a tutorial for how to deploy TVM to AWS F1 FPGA instance. - -.. note:: - - This feature is still experimental. We cannot use SDAccel to deploy an end to end neural networks for now. - -We use two python scripts for this tutorial. - -- build.py - a script to synthesize FPGA bitstream. - - .. 
code:: python - - import tvm - from tvm import te - - tgt= tvm.target.Target("sdaccel", host="llvm") - - n = te.var("n") - A = te.placeholder((n,), name='A') - B = te.placeholder((n,), name='B') - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - - s = te.create_schedule(C.op) - px, x = s[C].split(C.op.axis[0], nparts=1) - - s[C].bind(px, tvm.te.thread_axis("pipeline")) - - fadd = tvm.build(s, [A, B, C], tgt, name="myadd") - - fadd.save("myadd.o") - fadd.imported_modules[0].save("myadd.xclbin") - - tvm.contrib.cc.create_shared("myadd.so", ["myadd.o"]) - -- run.py - a script to use FPGA as an accelerator. - - .. code:: python - - import tvm - import numpy as np - import os - - tgt="sdaccel" - - fadd = tvm.runtime.load_module("myadd.so") - if os.environ.get("XCL_EMULATION_MODE"): - fadd_dev = tvm.runtime.load_module("myadd.xclbin") - else: - fadd_dev = tvm.runtime.load_module("myadd.awsxclbin") - fadd.import_module(fadd_dev) - - dev = tvm.device(tgt, 0) - - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev) - c = tvm.nd.array(np.zeros(n, dtype="float32"), dev) - - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - -Setup ------ - -- Launch an instance using the FPGA Developer AMI. We don't need an F1 instance for emulation and synthesis, so it is recommended to use a lower cost instance for them. -- Setup AWS FPGA development kit. - - .. code:: bash - - git clone https://github.com/aws/aws-fpga.git - cd aws-fpga - source sdaccel_setup.sh - source ${XILINX_SDX}/settings64.sh - -- Setup TVM with OpenCL enabled. - -Emulation ---------- - -- Create emconfig.json for emulation. - - .. code:: bash - - emconfigutil --platform ${AWS_PLATFORM} --nd 1 - -- Copy emconfig.json to the python binary directory. It is because the current Xilinx toolkit assumes that both host binary and the emconfig.json file are in the same path. - - .. code:: bash - - cp emconfig.json $(dirname $(which python)) - -- Run software emulation - - .. code:: bash - - export XCL_EMULATION_MODE=1 - export XCL_TARGET=sw_emu - - python build.py - python run.py - -- Run hardware emulation - - .. code:: bash - - export XCL_EMULATION_MODE=1 - export XCL_TARGET=hw_emu - - python build.py - python run.py - -Synthesis ---------- - -- Run synthesis with the following script. - - .. code:: bash - - unset XCL_EMULATION_MODE - export XCL_TARGET=hw - - python build.py - -- Create AWS FPGA image and upload it to AWS S3. - - .. code:: bash - - ${SDACCEL_DIR}/tools/create_sdaccel_afi.sh \ - -xclbin=myadd.xclbin -o=myadd \ - -s3_bucket= -s3_dcp_key= \ - -s3_logs_key= - - This also generates an awsxclbin file, which is necessary to use the AWS FPGA image on F1 instances. - -Run ---- - -- Launch Amazon EC2 F1 instance. -- Copy ``myadd.so``, ``myadd.awsxclbin``, and ``run.py`` to the F1 instance. -- Setup AWS FPGA development kit. - - .. code:: bash - - git clone https://github.com/aws/aws-fpga.git - cd aws-fpga - source sdaccel_setup.sh - -- Setup TVM with OpenCL enabled. -- Become root and setup environment variables. - - .. code:: bash - - sudo sh - source ${INSTALL_ROOT}/setup.sh - -- Run - - .. code:: bash - - python run.py diff --git a/docs/how_to/deploy/index.rst b/docs/how_to/deploy/index.rst deleted file mode 100644 index 4c3f30964b63..000000000000 --- a/docs/how_to/deploy/index.rst +++ /dev/null @@ -1,191 +0,0 @@ -.. 
Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _deploy-and-integration: - -Deploy Models and Integrate TVM -=============================== - -This page contains guidelines on how to deploy TVM to various platforms -as well as how to integrate it with your project. - -.. image:: https://tvm.apache.org/images/release/tvm_flexible.png - -Build the TVM runtime library ------------------------------ - -.. _build-tvm-runtime-on-target-device: - -Unlike traditional deep learning frameworks. TVM stack is divided into two major components: - -- TVM compiler, which does all the compilation and optimizations of the model -- TVM runtime, which runs on the target devices. - -In order to integrate the compiled module, we **do not** need to build entire -TVM on the target device. You only need to build the TVM compiler stack on your -desktop and use that to cross-compile modules that are deployed on the target device. - -We only need to use a light-weight runtime API that can be integrated into various platforms. - -For example, you can run the following commands to build the runtime API -on a Linux based embedded system such as Raspberry Pi: - -.. code:: bash - - git clone --recursive https://github.com/apache/tvm tvm - cd tvm - mkdir build - cp cmake/config.cmake build - cd build - cmake .. - make runtime - -Note that we type ``make runtime`` to only build the runtime library. - -It is also possible to cross compile the runtime. Cross compiling -the runtime library should not be confused with cross compiling models -for embedded devices. - -If you want to include additional runtime such as OpenCL, -you can modify ``config.cmake`` to enable these options. -After you get the TVM runtime library, you can link the compiled library - -.. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/dev/tvm_deploy_crosscompile.svg - :align: center - :width: 85% - -A model (optimized or not by TVM) can be cross compiled by TVM for -different architectures such as ``aarch64`` on a ``x64_64`` host. Once the model -is cross compiled it is necessary to have a runtime compatible with the target -architecture to be able to run the cross compiled model. - - -Cross compile the TVM runtime for other architectures ------------------------------------------------------ - -In the example :ref:`above ` the runtime library was -compiled on a Raspberry Pi. Producing the runtime library can be done much faster on -hosts that have high performace processors with ample resources (such as laptops, workstation) -compared to a target devices such as a Raspberry Pi. In-order to cross compile the runtime the toolchain -for the target device must be installed. 
After installing the correct toolchain, -the main difference compared to compiling natively is to pass some additional command -line argument to cmake that specify a toolchain to be used. For reference -building the TVM runtime library on a modern laptop (using 8 threads) for ``aarch64`` -takes around 20 seconds vs ~10 min to build the runtime on a Raspberry Pi 4. - -cross-compile for aarch64 -""""""""""""""""""""""""" - -.. code-block:: bash - - sudo apt-get update - sudo apt-get install gcc-aarch64-linux-gnu g++-aarch64-linux-gnu - -.. code-block:: bash - - cmake .. \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_VERSION=1 \ - -DCMAKE_C_COMPILER=/usr/bin/aarch64-linux-gnu-gcc \ - -DCMAKE_CXX_COMPILER=/usr/bin/aarch64-linux-gnu-g++ \ - -DCMAKE_FIND_ROOT_PATH=/usr/aarch64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DMACHINE_NAME=aarch64-linux-gnu - - make -j$(nproc) runtime - -For bare metal ARM devices the following toolchain is quite handy to install instead of gcc-aarch64-linux-* - -.. code-block:: bash - - sudo apt-get install gcc-multilib-arm-linux-gnueabihf g++-multilib-arm-linux-gnueabihf - - -cross-compile for RISC-V -""""""""""""""""""""""""" - -.. code-block:: bash - - sudo apt-get update - sudo apt-get install gcc-riscv64-linux-gnu g++-riscv64-linux-gnu - - -.. code-block:: bash - - cmake .. \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_SYSTEM_VERSION=1 \ - -DCMAKE_C_COMPILER=/usr/bin/riscv64-linux-gnu-gcc \ - -DCMAKE_CXX_COMPILER=/usr/bin/riscv64-linux-gnu-g++ \ - -DCMAKE_FIND_ROOT_PATH=/usr/riscv64-linux-gnu \ - -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \ - -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \ - -DMACHINE_NAME=riscv64-linux-gnu - - make -j$(nproc) runtime - -The ``file`` command can be used to query the architecture of the produced runtime. - - -.. code-block:: bash - - file libtvm_runtime.so - libtvm_runtime.so: ELF 64-bit LSB shared object, UCB RISC-V, version 1 (GNU/Linux), dynamically linked, BuildID[sha1]=e9ak845b3d7f2c126dab53632aea8e012d89477e, not stripped - - -Optimize and tune models for target devices -------------------------------------------- - -The easiest and recommended way to test, tune and benchmark TVM kernels on -embedded devices is through TVM's RPC API. -Here are the links to the related tutorials. - -- :ref:`tutorial-cross-compilation-and-rpc` -- :ref:`tutorial-deploy-model-on-rasp` - -Deploy optimized model on target devices ----------------------------------------- - -After you finished tuning and benchmarking, you might need to deploy the model on the -target device without relying on RPC. See the following resources on how to do so. - -.. toctree:: - :maxdepth: 2 - - cpp_deploy - android - adreno - integrate - hls - arm_compute_lib - tensorrt - vitis_ai - bnns - mrvl - -Additional Deployment How-Tos ------------------------------ - -We have also developed a number of how-tos targeting specific devices, with -working Python code that can be viewed in a Jupyter notebook. These how-tos -describe how to prepare and deploy models to many of the supported backends. - -.. toctree:: - :maxdepth: 1 - - ../deploy_models/index diff --git a/docs/how_to/deploy/integrate.rst b/docs/how_to/deploy/integrate.rst deleted file mode 100644 index fe40f6897335..000000000000 --- a/docs/how_to/deploy/integrate.rst +++ /dev/null @@ -1,70 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. 
See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Integrate TVM into Your Project -=============================== - -TVM's runtime is designed to be lightweight and portable. -There are several ways you can integrate TVM into your project. - -This article introduces possible ways to integrate TVM -as a JIT compiler to generate functions on your system. - - -DLPack Support --------------- - -TVM's generated function follows the PackedFunc convention. -It is a function that can take positional arguments including -standard types such as float, integer, string. -The PackedFunc takes DLTensor pointer in `DLPack `_ convention. -So the only thing you need to solve is to create a corresponding DLTensor object. - - - -Integrate User Defined C++ Array --------------------------------- - -The only thing we have to do in C++ is to convert your array to DLTensor and pass in its address as -``DLTensor*`` to the generated function. - - -Integrate User Defined Python Array ------------------------------------ - -Assume you have a python object ``MyArray``. There are three things that you need to do - -- Add ``_tvm_tcode`` field to your array which returns ``tvm.TypeCode.ARRAY_HANDLE`` -- Support ``_tvm_handle`` property in your object, which returns the address of DLTensor in python integer -- Register this class by ``tvm.register_extension`` - -.. code:: python - - # Example code - import tvm - - class MyArray(object): - _tvm_tcode = tvm.TypeCode.ARRAY_HANDLE - - @property - def _tvm_handle(self): - dltensor_addr = self.get_dltensor_addr() - return dltensor_addr - - # You can put registration step in a separate file mypkg.tvm.py - # and only optionally import that if you only want optional dependency. - tvm.register_extension(MyArray) diff --git a/docs/how_to/deploy/mrvl.rst b/docs/how_to/deploy/mrvl.rst deleted file mode 100644 index a0876fbe5aec..000000000000 --- a/docs/how_to/deploy/mrvl.rst +++ /dev/null @@ -1,277 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - - -Marvell Machine Learning Integration -==================================== - -1. 
Introduction ---------------- -Marvell(R) supports a family of high performance Data Processing -Units (DPUs) with integrated compute, high speed I/O and workload -accelerators. These workload accelerators includes Marvell's -Machine Learning Inference Processor (MLIP), a highly optimized, -integrated inference engine. - -TVM supports Marvell's MLIP using the "mrvl" library. This partitions and -compiles supported operations for accelerated execution on MLIP, or LLVM -for general compute. - -For runtime, the library supports native execution on MLIP hardware -as well as Marvell's ML simulator (mrvl-mlsim). - -The library supports Marvell's Octeon family of processors with ML accelarators. - -This guide demonstrates building TVM with codegen and -runtime enabled. It also provides example code to compile and run -models using 'mrvl' runtime. - -2. Building TVM with mrvl support ---------------------------------- - -2.1 Clone TVM repo -------------------- - -Refer to the following TVM documentation for cloning TVM -https://tvm.apache.org/docs/install/from_source.html - -2.2 Build and start the TVM - mrvl docker container ----------------------------------------------------- - -.. code:: bash - - ./docker/build.sh demo_mrvl bash # Build the docker container - ./docker/bash.sh tvm.demo_mrvl # Load the docker image - -3. Compiling a model using TVMC command line --------------------------------------------- -Models can be compiled and run for mrvl target using TVMC -which is optimized for performance. - -Refer to the following TVMC documentation, for tvmc generic options. -https://tvm.apache.org/docs/tutorial/tvmc_command_line_driver.html - -Additional mrvl-specific options may be added as attributes if -necessary. The advanced usage is described in this document below. - -3.1 TVMC Compilation Flow for a model -------------------------------------- - -Refer to the following TVM documentation, for compilation flow -https://tvm.apache.org/docs/arch/index.html#example-compilation-flow - - -3.2. TVMC - Command line option(s): Syntax for mrvl target ----------------------------------------------------------- - -Compiling an ONNX model using the tvmc for mrvl target. - -**Syntax:** - -.. code:: python - - python3 -m tvm.driver.tvmc compile --target="mrvl, llvm" - --target-llvm- - --target-mrvl- - -- - model_file.onnx - -Following is an example TVMC Compile command for an ARMv9 core and -integrated MLIP cn10ka processor, using only 4 tiles in the block. - -**Example:** - -.. code:: python - - python3 -m tvm.driver.tvmc compile --target="mrvl, llvm" \ - --target-llvm-mtriple=aarch64-linux-gnu --target-llvm-mcpu=neoverse-n2 \ - --target-mrvl-num_tiles=4 \ - --target-mrvl-mattr="hw -quantize=fp16 -wb_pin_ocm=1" \ - --cross-compiler aarch64-linux-gnu-gcc \ - --output model.tar \ - mnist-12.onnx - - -3.3. TVMC Compiler: mrvl specific Command Line Options ------------------------------------------------------- - -.. code:: python - - --target-mrvl-mcpu - --target-mrvl-num_tiles - --target-mrvl-mattr - -**Description of mrvl options** - -* mcpu: - The CPU class of Marvell(R) ML Inference Processor; - possible values = {cn10ka, cnf10kb}; defaults to cn10ka - -* num_tiles: - Maximum number of tiles that may be used, possible values = {1,2,4,8}, defaults to 8 - -* mattr: - Attributes for mrvl; possible values = {quantize, wb_pin_ocm, run_mode} - - mattr specifies the data type, code generation options and optimizations. - - *List of supported attributes are:* - - **1. quantize** - - Specify the data type. 
Possible values = {fp16, int8}. - Default is fp16, int8 is WIP and full support will be added in a future PR. - - **2. wb_pin_ocm** - - Optimize runtime by preloading a model's weights and bias into - the on chip memory. Possible values = {0, 1}. Default is 0 (no preload) - - **3. run_mode** - - Specify whether to compile for the simulator or for the target hardware (Octeon). - Possible values = {sim, hw}. Default is sim (software simulator). - -4. Compile ONNX model using the TVMC flow ------------------------------------------ - -In the TVMC mrvl flow, the model is partitioned into Marvell and LLVM regions. -Building each partitioned Marvell subgraph generates serialized nodes.json and -const.json. Partitioned nodes.json is the representation of the model graph which is -suitable for the Marvell compiler (mrvl-tmlc). The compiler compiles the model graph to -generate the model binary with MLIP instructions. - -4.1 Compile and Run ONNX model for Simulator + LLVM / x86_64 target --------------------------------------------------------------------- - -**Model Compilation for Simulator + LLVM / x86_64 target** - -.. code:: python - - python3 -m tvm.driver.tvmc compile --target="mrvl, llvm" \ - --target-mrvl-num_tiles=4 --output model.tar model.onnx - -**Run TVM models on x86_64 host using MLIP Simulator** - -Generated model binary is simulated using Marvell's MLIP Simulator(mrvl-mlsim). - -.. code:: python - - python3 -m tvm.driver.tvmc run --inputs infer.npz --outputs predict.npz model.tar --number=0 - -4.2 Compile and Run ONNX model for Octeon target ----------------------------------------------------------- - -**Model Compilation for Octeon target** - -Please refer to section 3.2 for the example command line. - -**Run TVM models on the Octeon Target** - -The cross compiled binary can be run on the target hardware using the tvmc run command. -Alternatively, the RPC flow enables remote execution on the target device from your -local machine: https://tvm.apache.org/docs/how_to/tutorials/cross_compilation_and_rpc.html - -.. code:: python - - python3 -m tvm.driver.tvmc run --inputs infer.npz --outputs predict.npz model.tar - -5. Compiling a model using Python APIs --------------------------------------- - -In addition to using TVMC, models can also be compiled and run using -TVM Python API. Below is an example to compile and run the MNIST model. - -**Download MNIST model from the web** - -.. code:: bash - - cd $HOME - wget https://github.com/onnx/models/raw/main/validated/vision/classification/mnist/model/mnist-12.onnx - -**Import the TVM and other dependent modules** - -.. code:: python - - import tvm, onnx - import numpy as np - import tvm.relay as relay - from tvm.contrib import graph_executor - from tvm.relay.op.contrib.mrvl import partition_for_mrvl - from tvm.relay.build_module import build - from keras.datasets import mnist - -**Load model onnx file** - -.. code:: python - - onnx_model = onnx.load("mnist-12.onnx") - -**Create a Relay graph from MNIST model** - -.. code:: python - - shape_dict = {'Input3' : (1,1,28,28)} - mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) - -**Define option dictionary and Partition the Model** - -Annotate and partition the graph for mrvl. All operations which are supported -by the mrvl will be marked and offloaded to mrvl hardware accelerator. The rest of the -operations will go through the regular LLVM compilation and code generation for ARM. - -.. 
code:: python - - tvm_target = "llvm" - - option_dict = {'num_tiles': 4} - - mod = partition_for_mrvl(mod, params, **option_dict) - -**Build the Relay Graph** - -Build the Relay graph, using the new module returned by partition_for_mrvl. - -.. code:: python - - with tvm.transform.PassContext(opt_level=3, config={"relay.ext.mrvl.options" : option_dict}): - model_lib = relay.build(mod, tvm_target, params=params) - -**Generate runtime graph of the model library** - -.. code:: python - - dev = tvm.cpu() - model_rt_graph = graph_executor.GraphModule(model_lib["default"](dev)) - -**Get test data and initialize model input** - -.. code:: python - - (train_X, train_y), (test_X, test_y) = mnist.load_data() - image = tvm.nd.array(test_X[0].reshape(1, 1, 28, 28).astype("float32") / 255) - inputs_dict = {} - inputs_dict["Input3"] = image - model_rt_graph.set_input(**inputs_dict) - -**Run Inference and print the output** - -.. code:: python - - model_rt_graph.run() - output_tensor = model_rt_graph.get_output(0).numpy() - print (output_tensor) diff --git a/docs/how_to/deploy/tensorrt.rst b/docs/how_to/deploy/tensorrt.rst deleted file mode 100644 index cb1b309db0f0..000000000000 --- a/docs/how_to/deploy/tensorrt.rst +++ /dev/null @@ -1,315 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Relay TensorRT Integration -========================== -**Author**: `Trevor Morris `_ - -Introduction ------------- - -NVIDIA TensorRT is a library for optimized deep learning inference. This integration will offload as -many operators as possible from Relay to TensorRT, providing a performance boost on NVIDIA GPUs -without the need to tune schedules. - -This guide will demonstrate how to install TensorRT and build TVM with TensorRT BYOC and runtime -enabled. It will also provide example code to compile and run a ResNet-18 model using TensorRT and -how to configure the compilation and runtime settings. Finally, we document the supported operators -and how to extend the integration to support other operators. - -Installing TensorRT -------------------- - -In order to download TensorRT, you will need to create an NVIDIA Developer program account. Please -see NVIDIA's documentation for more info: -https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html. If you have a Jetson device -such as a TX1, TX2, Xavier, or Nano, TensorRT will already be installed on the device via the -JetPack SDK. - -There are two methods to install TensorRT: - -* System install via deb or rpm package. -* Tar file installation. - -With the tar file installation method, you must provide the path of the extracted tar archive to -USE_TENSORRT_RUNTIME=/path/to/TensorRT. With the system install method, -USE_TENSORRT_RUNTIME=ON will automatically locate your installation. 
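Once TVM has been rebuilt with the flags described in the next section, it can be useful to
confirm from Python that the TensorRT codegen was actually picked up. The check below is a
minimal sketch: it assumes the usual BYOC convention that the codegen is registered as a global
packed function named ``relay.ext.tensorrt``, which is an assumption and may differ between TVM
versions.

.. code:: python

    import tvm

    # With allow_missing=True, get_global_func returns None instead of raising
    # when the function is not registered in this build.
    codegen = tvm.get_global_func("relay.ext.tensorrt", True)
    print("TensorRT codegen enabled:", codegen is not None)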
- -Building TVM with TensorRT support ----------------------------------- - -There are two separate build flags for TensorRT integration in TVM. These flags also enable -cross-compilation: USE_TENSORRT_CODEGEN=ON will also you to build a module with TensorRT support on -a host machine, while USE_TENSORRT_RUNTIME=ON will enable the TVM runtime on an edge device to -execute the TensorRT module. You should enable both if you want to compile and also execute models -with the same TVM build. - -* USE_TENSORRT_CODEGEN=ON/OFF - This flag will enable compiling a TensorRT module, which does not require any - TensorRT library. -* USE_TENSORRT_RUNTIME=ON/OFF/path-to-TensorRT - This flag will enable the TensorRT runtime module. - This will build TVM against the installed TensorRT library. - -Example setting in config.cmake file: - -.. code:: cmake - - set(USE_TENSORRT_CODEGEN ON) - set(USE_TENSORRT_RUNTIME /home/ubuntu/TensorRT-7.0.0.11) - - -Build and Deploy ResNet-18 with TensorRT ----------------------------------------- - -Create a Relay graph from a MXNet ResNet-18 model. - -.. code:: python - - import tvm - from tvm import relay - import mxnet - from mxnet.gluon.model_zoo.vision import get_model - - dtype = "float32" - input_shape = (1, 3, 224, 224) - block = get_model('resnet18_v1', pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype) - - -Annotate and partition the graph for TensorRT. All ops which are supported by the TensorRT -integration will be marked and offloaded to TensorRT. The rest of the ops will go through the -regular TVM CUDA compilation and code generation. - -.. code:: python - - from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt - mod = partition_for_tensorrt(mod, params) - - -Build the Relay graph, using the new module and config returned by partition_for_tensorrt. The -target must always be a cuda target. ``partition_for_tensorrt`` will automatically fill out the -required values in the config, so there is no need to modify it - just pass it along to the -PassContext so the values can be read during compilation. - -.. code:: python - - target = "cuda" - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - - -Export the module. - -.. code:: python - - lib.export_library('compiled.so') - - -Load module and run inference on the target machine, which must be built with -``USE_TENSORRT_RUNTIME`` enabled. The first run will take longer because the TensorRT engine will -have to be built. - -.. code:: python - - dev = tvm.cuda(0) - loaded_lib = tvm.runtime.load_module('compiled.so') - gen_module = tvm.contrib.graph_executor.GraphModule(loaded_lib['default'](dev)) - input_data = np.random.uniform(0, 1, input_shape).astype(dtype) - gen_module.run(data=input_data) - - -Partitioning and Compilation Settings -------------------------------------- - -There are some options which can be configured in ``partition_for_tensorrt``. - -* ``version`` - TensorRT version to target as tuple of (major, minor, patch). If TVM is compiled - with USE_TENSORRT_RUNTIME=ON, the linked TensorRT version will be used instead. The version - will affect which ops can be partitioned to TensorRT. -* ``use_implicit_batch`` - Use TensorRT implicit batch mode (default true). Setting to false will - enable explicit batch mode which will widen supported operators to include those which modify the - batch dimension, but may reduce performance for some models. 
-* ``remove_no_mac_subgraphs`` - A heuristic to improve performance. Removes subgraphs which have - been partitioned for TensorRT if they do not have any multiply-accumulate operations. The removed - subgraphs will go through TVM's standard compilation instead. -* ``max_workspace_size`` - How many bytes of workspace to allow each subgraph to use for - TensorRT engine creation. See the TensorRT documentation for more info. Can be overridden at runtime. - - -Runtime Settings ---------------- - -There are some additional options which can be configured at runtime using environment variables. - -* Automatic FP16 Conversion - The environment variable ``TVM_TENSORRT_USE_FP16=1`` can be set to - automatically convert the TensorRT components of your model to 16-bit floating point precision. - This can greatly increase performance, but may cause a slight loss in model accuracy. -* Caching TensorRT Engines - During the first inference, the runtime will invoke the TensorRT API - to build an engine. This can be time consuming, so you can set ``TVM_TENSORRT_CACHE_DIR`` to - point to a directory in which to save the built engines on disk. The next time you load the model - and give it the same directory, the runtime will load the already built engines to avoid the long - warmup time. A unique directory is required for each model. -* TensorRT has a parameter to configure the maximum amount of scratch space that each layer in the - model can use. It is generally best to use the highest value which does not cause you to run out - of memory. You can use ``TVM_TENSORRT_MAX_WORKSPACE_SIZE`` to override this by specifying the - workspace size in bytes you would like to use. -* For models which contain a dynamic batch dimension, the variable ``TVM_TENSORRT_MULTI_ENGINE`` - can be used to determine how TensorRT engines will be created at runtime. The default mode, - ``TVM_TENSORRT_MULTI_ENGINE=0``, will maintain only one engine in memory at a time. If an input - is encountered with a higher batch size, the engine will be rebuilt with the new max_batch_size - setting. That engine will be compatible with all batch sizes from 1 to max_batch_size. This mode - reduces the amount of memory used at runtime. The second mode, ``TVM_TENSORRT_MULTI_ENGINE=1``, - will build a unique TensorRT engine optimized for each batch size that is encountered. - This will give greater performance, but will consume more memory.
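Because these settings are plain environment variables, they can also be configured from Python
before the first inference call (which is when the engine is built). The sketch below is
illustrative only: it reuses the ``compiled.so`` module exported earlier in this guide, and the
cache directory path is a hypothetical example.

.. code:: python

    import os

    import numpy as np
    import tvm
    from tvm.contrib import graph_executor

    # Configure the TensorRT runtime before the first run; the cache directory must exist.
    os.makedirs("/tmp/trt_engine_cache", exist_ok=True)            # hypothetical location
    os.environ["TVM_TENSORRT_CACHE_DIR"] = "/tmp/trt_engine_cache"
    os.environ["TVM_TENSORRT_USE_FP16"] = "1"
    os.environ["TVM_TENSORRT_MAX_WORKSPACE_SIZE"] = str(1 << 30)   # 1 GiB of scratch space

    dev = tvm.cuda(0)
    lib = tvm.runtime.load_module("compiled.so")
    module = graph_executor.GraphModule(lib["default"](dev))

    input_shape = (1, 3, 224, 224)
    module.run(data=np.random.uniform(0, 1, input_shape).astype("float32"))

The first run still pays the engine-build cost; subsequent runs, and later processes pointed at
the same cache directory, reuse the serialized engines.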
- - -Operator support ----------------- -+------------------------+------------------------------------+ -| Relay Node | Remarks | -+========================+====================================+ -| nn.relu | | -+------------------------+------------------------------------+ -| sigmoid | | -+------------------------+------------------------------------+ -| tanh | | -+------------------------+------------------------------------+ -| nn.batch_norm | | -+------------------------+------------------------------------+ -| nn.layer_norm | | -+------------------------+------------------------------------+ -| nn.softmax | | -+------------------------+------------------------------------+ -| nn.conv1d | | -+------------------------+------------------------------------+ -| nn.conv2d | | -+------------------------+------------------------------------+ -| nn.dense | | -+------------------------+------------------------------------+ -| nn.bias_add | | -+------------------------+------------------------------------+ -| add | | -+------------------------+------------------------------------+ -| subtract | | -+------------------------+------------------------------------+ -| multiply | | -+------------------------+------------------------------------+ -| divide | | -+------------------------+------------------------------------+ -| power | | -+------------------------+------------------------------------+ -| maximum | | -+------------------------+------------------------------------+ -| minimum | | -+------------------------+------------------------------------+ -| nn.max_pool2d | | -+------------------------+------------------------------------+ -| nn.avg_pool2d | | -+------------------------+------------------------------------+ -| nn.global_max_pool2d | | -+------------------------+------------------------------------+ -| nn.global_avg_pool2d | | -+------------------------+------------------------------------+ -| exp | | -+------------------------+------------------------------------+ -| log | | -+------------------------+------------------------------------+ -| sqrt | | -+------------------------+------------------------------------+ -| abs | | -+------------------------+------------------------------------+ -| negative | | -+------------------------+------------------------------------+ -| nn.batch_flatten | | -+------------------------+------------------------------------+ -| expand_dims | | -+------------------------+------------------------------------+ -| squeeze | | -+------------------------+------------------------------------+ -| concatenate | | -+------------------------+------------------------------------+ -| nn.conv2d_transpose | | -+------------------------+------------------------------------+ -| transpose | | -+------------------------+------------------------------------+ -| layout_transform | | -+------------------------+------------------------------------+ -| reshape | | -+------------------------+------------------------------------+ -| nn.pad | | -+------------------------+------------------------------------+ -| sum | | -+------------------------+------------------------------------+ -| prod | | -+------------------------+------------------------------------+ -| max | | -+------------------------+------------------------------------+ -| min | | -+------------------------+------------------------------------+ -| mean | | -+------------------------+------------------------------------+ -| nn.adaptive_max_pool2d | | -+------------------------+------------------------------------+ -| 
nn.adaptive_avg_pool2d | | -+------------------------+------------------------------------+ -| nn.batch_matmul | | -+------------------------+------------------------------------+ -| clip | Requires TensorRT 5.1.5 or greater | -+------------------------+------------------------------------+ -| nn.leaky_relu | Requires TensorRT 5.1.5 or greater | -+------------------------+------------------------------------+ -| sin | Requires TensorRT 5.1.5 or greater | -+------------------------+------------------------------------+ -| cos | Requires TensorRT 5.1.5 or greater | -+------------------------+------------------------------------+ -| atan | Requires TensorRT 5.1.5 or greater | -+------------------------+------------------------------------+ -| ceil | Requires TensorRT 5.1.5 or greater | -+------------------------+------------------------------------+ -| floor | Requires TensorRT 5.1.5 or greater | -+------------------------+------------------------------------+ -| split | Requires TensorRT 5.1.5 or greater | -+------------------------+------------------------------------+ -| strided_slice | Requires TensorRT 5.1.5 or greater | -+------------------------+------------------------------------+ -| nn.conv3d | Requires TensorRT 6.0.1 or greater | -+------------------------+------------------------------------+ -| nn.max_pool3d | Requires TensorRT 6.0.1 or greater | -+------------------------+------------------------------------+ -| nn.avg_pool3d | Requires TensorRT 6.0.1 or greater | -+------------------------+------------------------------------+ -| nn.conv3d_transpose | Requires TensorRT 6.0.1 or greater | -+------------------------+------------------------------------+ -| erf | Requires TensorRT 7.0.0 or greater | -+------------------------+------------------------------------+ - - -Adding a new operator ---------------------- -To add support for a new operator, there are a series of files we need to make changes to: - -* `src/runtime/contrib/tensorrt/tensorrt_ops.cc` Create a new op converter class which - implements the ``TensorRTOpConverter`` interface. You must implement the constructor to specify how - many inputs there are and whether they are tensors or weights. You must also implement the - ``Convert`` method to perform the conversion. This is done by using the inputs, attributes, and - network from params to add the new TensorRT layers and push the layer outputs. You can use the - existing converters as an example. Finally, register your new op conventer in the - ``GetOpConverters()`` map. -* `python/relay/op/contrib/tensorrt.py` This file contains the annotation rules for TensorRT. These - determine which operators and their attributes that are supported. You must register an annotation - function for the relay operator and specify which attributes are supported by your converter, by - checking the attributes are returning true or false. -* `tests/python/contrib/test_tensorrt.py` Add unit tests for the given operator. diff --git a/docs/how_to/deploy/vitis_ai.rst b/docs/how_to/deploy/vitis_ai.rst deleted file mode 100755 index 101be905ad59..000000000000 --- a/docs/how_to/deploy/vitis_ai.rst +++ /dev/null @@ -1,480 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. 
The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - - -Vitis AI Integration -==================== - -`Vitis AI `__ is Xilinx's -development stack for hardware-accelerated AI inference on Xilinx -platforms, including both edge devices and Alveo cards. It consists of -optimized IP, tools, libraries, models, and example designs. It is -designed with high efficiency and ease of use in mind, unleashing the -full potential of AI acceleration on Xilinx FPGA and ACAP. - -The current Vitis AI flow inside TVM enables acceleration of Neural -Network model inference on edge and cloud with the `Zynq Ultrascale+ -MPSoc `__, -`Alveo `__ -and `Versal `__ platforms. -The identifiers for the supported edge and cloud Deep Learning Processor Units (DPU's) are: - -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ -| **Target Board** | **DPU ID** | **TVM Target ID** | -+=========================================================================================+=======================+============================+ -| `ZCU104 `__ | DPUCZDX8G | DPUCZDX8G-zcu104 | -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ -| `ZCU102 `__ | DPUCZDX8G | DPUCZDX8G-zcu102 | -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ -| `Kria KV260 `__ | DPUCZDX8G | DPUCZDX8G-kv260 | -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ -| `VCK190 `__ | DPUCVDX8G | DPUCVDX8G | -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ -| `VCK5000 `__ | DPUCVDX8H | DPUCVDX8H | -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ -| `U200 `__ | DPUCADF8H | DPUCADF8H | -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ -| `U250 `__ | DPUCADF8H | DPUCADF8H | -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ -| `U50 `__ | DPUCAHX8H / DPUCAHX8L | DPUCAHX8H-u50 / DPUCAHX8L | -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ -| `U280 `__ | DPUCAHX8H / DPUCAHX8L | DPUCAHX8H-u280 / DPUCAHX8L | -+-----------------------------------------------------------------------------------------+-----------------------+----------------------------+ - -For more information about the DPU identifiers see following table: - 
-+-------------------+-------------+--------------------------------+------------------------+------------------------+------------------------+ -| DPU | Application | HW Platform | Quantization Method | Quantization Bitwidth | Design Target | -+===================+=============+================================+========================+========================+========================+ -| | Deep Learning | | C: CNN | | AD: Alveo DDR | | X: DECENT | | 4: 4-bit | | G: General purpose | -| | Processing Unit | | R: RNN | | AH: Alveo HBM | | I: Integer threshold | | 8: 8-bit | | H: High throughput | -| | | | VD: Versal DDR with AIE & PL | | F: Float threshold | | 16: 16-bit | | L: Low latency | -| | | | ZD: Zynq DDR | | R: RNN | | M: Mixed Precision | | C: Cost optimized | -+-------------------+-------------+--------------------------------+------------------------+------------------------+------------------------+ - -On this page you will find information on how to `setup <#setup-instructions>`__ TVM with Vitis AI -on different platforms (Zynq, Alveo, Versal) and on how to get started with `Compiling a Model <#compiling-a-model>`__ -and executing on different platforms: `Inference <#inference>`__. - -System Requirements -------------------- - -The `Vitis AI System Requirements page `__ -lists the system requirements for running docker containers as well as doing executing on Alveo cards. -For edge devices (e.g. Zynq), deploying models requires a host machine for compiling models using the TVM with Vitis AI flow, -and an edge device for running the compiled models. The host system requirements are the same as specified in the link above. - -Setup instructions ------------------- - -This section provide the instructions for setting up the TVM with Vitis AI flow for both cloud and edge. -TVM with Vitis AI support is provided through a docker container. The provided scripts and Dockerfile -compiles TVM and Vitis AI into a single image. - -1. Clone TVM repo - - .. code:: bash - - git clone --recursive https://github.com/apache/tvm.git - cd tvm - -2. Build and start the TVM - Vitis AI docker container. - - .. code:: bash - - ./docker/build.sh demo_vitis_ai bash - ./docker/bash.sh tvm.demo_vitis_ai - - # Setup inside container - conda activate vitis-ai-tensorflow - -3. Build TVM inside the container with Vitis AI (inside tvm directory) - - .. code:: bash - - mkdir build - cp cmake/config.cmake build - cd build - echo set\(USE_LLVM ON\) >> config.cmake - echo set\(USE_VITIS_AI ON\) >> config.cmake - cmake .. - make -j$(nproc) - -4. Install TVM - - .. code:: bash - - cd ../python - pip3 install -e . --user - -Inside this docker container you can now compile models for both cloud and edge targets. -To run on cloud Alveo or Versal VCK5000 cards inside the docker container, please follow the -`Alveo <#alveo-setup>`__ respectively `Versal VCK5000 <#versal-vck5000-setup>`__ setup instructions. -To setup your Zynq or Versal VCK190 evaluation board for inference, please follow -the `Zynq <#zynq-setup>`__ respectively `Versal VCK190 <#versal-vck190-setup>`__ instructions. - -Alveo Setup -~~~~~~~~~~~ - -Check out following page for setup information: `Alveo Setup `__. - -After setup, you can select the right DPU inside the docker container in the following way: - -.. 
code:: bash - - cd /workspace - git clone --branch v1.4 --single-branch --recursive https://github.com/Xilinx/Vitis-AI.git - cd Vitis-AI/setup/alveo - source setup.sh [DPU-IDENTIFIER] - -The DPU identifier for this can be found in the second column of the DPU Targets table at the top of this page. - -Versal VCK5000 Setup -~~~~~~~~~~~~~~~~~~~~ - -Check out following page for setup information: `VCK5000 Setup `__. - -After setup, you can select the right DPU inside the docker container in the following way: - -.. code:: bash - - cd /workspace - git clone --branch v1.4 --single-branch --recursive https://github.com/Xilinx/Vitis-AI.git - cd Vitis-AI/setup/vck5000 - source setup.sh - -Zynq Setup -~~~~~~~~~~ - -For the Zynq target (DPUCZDX8G) the compilation stage will run inside the docker on a host machine. -This doesn't require any specific setup except for building the TVM - Vitis AI docker. For executing the model, -the Zynq board will first have to be set up and more information on that can be found here. - -1. Download the Petalinux image for your target: - - `ZCU104 `__ - - `ZCU102 `__ - - `Kria KV260 `__ -2. Use Etcher software to burn the image file onto the SD card. -3. Insert the SD card with the image into the destination board. -4. Plug in the power and boot the board using the serial port to operate on the system. -5. Set up the IP information of the board using the serial port. For more details on step 1 to 5, please refer to `Setting Up The Evaluation Board `__. -6. Create 4GB of swap space on the board - -.. code:: bash - - fallocate -l 4G /swapfile - chmod 600 /swapfile - mkswap /swapfile - swapon /swapfile - echo "/swapfile swap swap defaults 0 0" >> /etc/fstab - -7. Install hdf5 dependency (will take between 30 min and 1 hour to finish) - -.. code:: bash - - cd /tmp && \ - wget https://support.hdfgroup.org/ftp/HDF5/releases/hdf5-1.10/hdf5-1.10.7/src/hdf5-1.10.7.tar.gz && \ - tar -zxvf hdf5-1.10.7.tar.gz && \ - cd hdf5-1.10.7 && \ - ./configure --prefix=/usr && \ - make -j$(nproc) && \ - make install && \ - cd /tmp && rm -rf hdf5-1.10.7* - -8. Install Python dependencies - -.. code:: bash - - pip3 install Cython==0.29.23 h5py==2.10.0 pillow - -9. Install PyXIR - -.. code:: bash - - git clone --recursive --branch rel-v0.3.1 --single-branch https://github.com/Xilinx/pyxir.git - cd pyxir - sudo python3 setup.py install --use_vart_edge_dpu - -10. Build and install TVM with Vitis AI - -.. code:: bash - - git clone --recursive https://github.com/apache/tvm - cd tvm - mkdir build - cp cmake/config.cmake build - cd build - echo set\(USE_LLVM OFF\) >> config.cmake - echo set\(USE_VITIS_AI ON\) >> config.cmake - cmake .. - make tvm_runtime -j$(nproc) - cd ../python - pip3 install --no-deps -e . - -11. Check whether the setup was successful in the Python shell: - -.. code:: bash - - python3 -c 'import pyxir; import tvm' - -.. note:: - - You might see a warning about the 'cpu-tf' runtime not being found. This warning is - expected on the board and can be ignored. - - -Versal VCK190 Setup -~~~~~~~~~~~~~~~~~~~ - -For the Versal VCK190 setup, please follow the instructions for `Zynq Setup <#zynq-setup>`__, -but now use the `VCK190 image `__ -in step 1. The other steps are the same. - - -Compiling a Model ------------------ - -The TVM with Vitis AI flow contains two stages: Compilation and Inference. -During the compilation a user can choose a model to compile for the cloud or -edge target devices that are currently supported. 
Once a model is compiled, -the generated files can be used to run the model on the specified target -device during the `Inference <#inference>`__ stage. Currently, the TVM with -Vitis AI flow supports a selected number of Xilinx data center and edge devices. - -In this section we walk through the typical flow for compiling models with Vitis AI -inside TVM. - -**Imports** - -Make sure to import PyXIR and the DPU target (``import pyxir.contrib.target.DPUCADF8H`` for DPUCADF8H): - -.. code:: python - - import os - - import pyxir - import pyxir.contrib.target.DPUCADF8H - - import tvm - import tvm.relay as relay - from tvm.contrib.target import vitis_ai - from tvm.contrib import utils, graph_executor - from tvm.relay.op.contrib.vitis_ai import partition_for_vitis_ai - -**Declare the Target** - -.. code:: python - - tvm_target = 'llvm' - dpu_target = 'DPUCADF8H' # options: 'DPUCADF8H', 'DPUCAHX8H-u50', 'DPUCAHX8H-u280', 'DPUCAHX8L', 'DPUCVDX8H', 'DPUCZDX8G-zcu104', 'DPUCZDX8G-zcu102', 'DPUCZDX8G-kv260' - -The TVM with Vitis AI flow currently supports the DPU targets listed in -the table at the top of this page. Once the appropriate targets are defined, -we invoke the TVM compiler to build the graph for the specified target. - -**Import the Model** - -Example code to import an MXNet model: - -.. code:: python - - mod, params = relay.frontend.from_mxnet(block, input_shape) - - -**Partition the Model** - -After importing the model, we utilize the Relay API to annotate the Relay expression for the provided DPU target and partition the graph. - -.. code:: python - - mod = partition_for_vitis_ai(mod, params, dpu=dpu_target) - - -**Build the Model** - -The partitioned model is passed to the TVM compiler to generate the runtime libraries for the TVM Runtime. - -.. code:: python - - export_rt_mod_file = os.path.join(os.getcwd(), 'vitis_ai.rtmod') - build_options = { - 'dpu': dpu_target, - 'export_runtime_module': export_rt_mod_file - } - with tvm.transform.PassContext(opt_level=3, config={'relay.ext.vitis_ai.options': build_options}): - lib = relay.build(mod, tvm_target, params=params) - -**Quantize the Model** - -Usually, to be able to accelerate inference of Neural Network models -with Vitis AI DPU accelerators, those models need to be quantized upfront. -In the TVM - Vitis AI flow, we make use of on-the-fly quantization to remove -this additional preprocessing step. In this flow, you don't need to -quantize your model upfront but can make use of the typical inference -execution calls (module.run) to quantize the model on-the-fly using the -first N inputs that are provided (see more information below). This will -set up and calibrate the Vitis-AI DPU and from that point onwards -inference will be accelerated for all subsequent inputs. Note that the edge -flow deviates slightly from the explained flow in that inference won't -be accelerated after the first N inputs but the model will have been -quantized and compiled and can be moved to the edge device for -deployment. Please check out the `Running on Zynq <#running-on-zynq>`__ -section below for more information. - -.. code:: python - - module = graph_executor.GraphModule(lib["default"](tvm.cpu())) - - # First N (default = 128) inputs are used for quantization calibration and will - # be executed on the CPU - # This config can be changed by setting the 'PX_QUANT_SIZE' (e.g. export PX_QUANT_SIZE=64) - for i in range(128): - module.set_input(input_name, inputs[i]) - module.run() - -By default, the number of images used for quantization is set to 128.
-You could change the number of images used for On-The-Fly Quantization -with the PX_QUANT_SIZE environment variable. For example, execute the -following line in the terminal before calling the compilation script -to reduce the quantization calibration dataset to eight images. -This can be used for quick testing. - -.. code:: bash - - export PX_QUANT_SIZE=8 - -Lastly, we store the compiled output from the TVM compiler on disk for -running the model on the target device. This happens as follows for -cloud DPU's (Alveo, VCK5000): - -.. code:: python - - lib_path = "deploy_lib.so" - lib.export_library(lib_path) - - -For edge targets (Zynq, VCK190) we have to rebuild for aarch64. To do this -we first have to normally export the module to also serialize the Vitis AI -runtime module (vitis_ai.rtmod). We will load this runtime module again -afterwards to rebuild and export for aarch64. - -.. code:: python - - temp = utils.tempdir() - lib.export_library(temp.relpath("tvm_lib.so")) - - # Build and export lib for aarch64 target - tvm_target = tvm.target.arm_cpu('ultra96') - lib_kwargs = { - 'fcompile': contrib.cc.create_shared, - 'cc': "/usr/aarch64-linux-gnu/bin/ld" - } - - build_options = { - 'load_runtime_module': export_rt_mod_file - } - with tvm.transform.PassContext(opt_level=3, config={'relay.ext.vitis_ai.options': build_options}): - lib_edge = relay.build(mod, tvm_target, params=params) - - lib_edge.export_library('deploy_lib_edge.so', **lib_kwargs) - - -This concludes the tutorial to compile a model using TVM with Vitis AI. -For instructions on how to run a compiled model please refer to the next section. - -Inference ---------- - -The TVM with Vitis AI flow contains two stages: Compilation and Inference. -During the compilation a user can choose to compile a model for any of the -target devices that are currently supported. Once a model is compiled, the -generated files can be used to run the model on a target device during the -Inference stage. - -Check out the `Running on Alveo and VCK5000 <#running-on-alveo-and-vck5000>`__ -and `Running on Zynq and VCK190 <#running-on-zynq-and-vck190>`__ sections for -doing inference on cloud accelerator cards respectively edge boards. - -Running on Alveo and VCK5000 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -After having followed the steps in the `Compiling a Model <#compiling-a-model>`__ -section, you can continue running on new inputs inside the docker for accelerated -inference: - -.. code:: python - - module.set_input(input_name, inputs[i]) - module.run() - -Alternatively, you can load the exported runtime module (the deploy_lib.so -exported in `Compiling a Model <#compiling-a-model>`__): - -.. code:: python - - import pyxir - import tvm - from tvm.contrib import graph_executor - - dev = tvm.cpu() - - # input_name = ... - # input_data = ... - - # load the module into memory - lib = tvm.runtime.load_module("deploy_lib.so") - - module = graph_executor.GraphModule(lib["default"](dev)) - module.set_input(input_name, input_data) - module.run() - -Running on Zynq and VCK190 -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before proceeding, please follow the `Zynq <#zynq-setup>`__ or -`Versal VCK190 <#versal-vck190-setup>`__ setup instructions. - -Prior to running a model on the board, you need to compile the model for -your target evaluation board and transfer the compiled model on to the board. -Please refer to the `Compiling a Model <#compiling-a-model>`__ section for -information on how to compile a model. 
- -Afterwards, you will have to transfer the compiled model (deploy_lib_edge.so) -to the evaluation board. Then, on the board you can use the typical -"load_module" and "module.run" APIs to execute. For this, please make sure to -run the script as root (execute ``su`` in terminal to log into root). - -.. note:: - - Note also that you **shouldn't** import the - PyXIR DPU targets in the run script (``import pyxir.contrib.target.DPUCZDX8G``). - -.. code:: python - - import pyxir - import tvm - from tvm.contrib import graph_executor - - dev = tvm.cpu() - - # input_name = ... - # input_data = ... - - # load the module into memory - lib = tvm.runtime.load_module("deploy_lib_edge.so") - - module = graph_executor.GraphModule(lib["default"](dev)) - module.set_input(input_name, input_data) - module.run() diff --git a/docs/how_to/legacy_index.rst b/docs/how_to/legacy_index.rst deleted file mode 100644 index 91ed4639db47..000000000000 --- a/docs/how_to/legacy_index.rst +++ /dev/null @@ -1,36 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -How To Guides -============= - -These user-focused "how to" guides are designed to help you find answers to -specific questions, like "How do I compile a model?" or "How to I optimize a -schedule with tesor expressions?" - -.. toctree:: - :maxdepth: 1 - - compile_models/index - deploy/index - work_with_relay/index - work_with_schedules/index - optimize_operators/index - tune_with_autotvm/index - tune_with_autoscheduler/index - extend_tvm/index - profile/index diff --git a/docs/how_to/profile/index.rst b/docs/how_to/profile/index.rst deleted file mode 100644 index 33642cf469e7..000000000000 --- a/docs/how_to/profile/index.rst +++ /dev/null @@ -1,24 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Profile Models -============== - -.. toctree:: - :maxdepth: 1 - - papi diff --git a/docs/how_to/profile/papi.rst b/docs/how_to/profile/papi.rst deleted file mode 100644 index 91599c9a7c6d..000000000000 --- a/docs/how_to/profile/papi.rst +++ /dev/null @@ -1,121 +0,0 @@ -.. 
Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - - -Getting Started With PAPI -========================= - -The Performance Application Programming Interface (PAPI) is a library that -provides performance counters on a variety of platforms. Performance counters -provide accurate low-level information about processors behavior during a given -execution run. This information can contain simple metrics like total cycle -count, cache misses, and instructions executed as well as more high level -information like total FLOPS and warp occupancy. PAPI makes these metrics -available while profiling. - -Installing PAPI ---------------- - -PAPI can either be installed using your package manager (``apt-get install libpapi-dev`` -on Ubuntu), or from source here: -https://github.com/icl-utk-edu/papi. - -Pulling the latest version of PAPI from source has caused build issues before. Therefore, it is recommended to checkout tagged version ``papi-6-0-0-1-t``. - -Building TVM With PAPI ----------------------- - -To include PAPI in your build of TVM, set the following line in you ``config.cmake``: - -.. code:: - - set(USE_PAPI ON) - -If PAPI is installed in a non-standard place, you can specify where it is like so: - -.. code:: - - set(USE_PAPI path/to/papi.pc) - - -Using PAPI While Profiling --------------------------- - -If TVM has been built with PAPI (see above), then you can pass a -:py:class:`tvm.runtime.profiling.PAPIMetricCollector` to -:py:meth:`tvm.runtime.GraphModule.profile` to collect performance metrics. Here -is an example: - -.. code:: python - - import tvm - from tvm import relay - from tvm.relay.testing import mlp - from tvm.runtime import profiler_vm - import numpy as np - - target = "llvm" - dev = tvm.cpu() - mod, params = mlp.get_workload(1) - - exe = relay.vm.compile(mod, target, params=params) - vm = profiler_vm.VirtualMachineProfiler(exe, dev) - - data = tvm.nd.array(np.random.rand(1, 1, 28, 28).astype("float32"), device=dev) - report = vm.profile( - data, - func_name="main", - collectors=[tvm.runtime.profiling.PAPIMetricCollector()], - ) - print(report) - -.. code:: - - Name perf::CACHE-MISSES perf::CYCLES perf::STALLED-CYCLES-BACKEND perf::INSTRUCTIONS perf::STALLED-CYCLES-FRONTEND - fused_nn_dense_nn_bias_add_nn_relu 2,494 1,570,698 85,608 675,564 39,583 - fused_nn_dense_nn_bias_add_nn_relu_1 1,149 655,101 13,278 202,297 21,380 - fused_nn_dense_nn_bias_add 288 600,184 8,321 163,446 19,513 - fused_nn_batch_flatten 301 587,049 4,636 158,636 18,565 - fused_nn_softmax 154 575,143 8,018 160,738 18,995 - ---------- - Sum 4,386 3,988,175 119,861 1,360,681 118,036 - Total 10,644 8,327,360 179,310 2,660,569 270,044 - -You can also change which metrics are collected: - -.. 
code:: python - - report = vm.profile( - data, - func_name="main", - collectors=[tvm.runtime.profiling.PAPIMetricCollector({dev: ["PAPI_FP_OPS"]})], - ) - -.. code:: - - Name PAPI_FP_OPS - fused_nn_dense_nn_bias_add_nn_relu 200,832 - fused_nn_dense_nn_bias_add_nn_relu_1 16,448 - fused_nn_dense_nn_bias_add 1,548 - fused_nn_softmax 160 - fused_nn_batch_flatten 0 - ---------- - Sum 218,988 - Total 218,988 - -You can find a list of available metrics by running the ``papi_avail`` and -``papi_native_avail`` commands. diff --git a/docs/index.rst b/docs/index.rst index 2eccb60caa23..05ca8c952bc3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -65,16 +65,6 @@ driving its costs down. reference/api/python/index reference/api/links -.. toctree:: - :maxdepth: 1 - :caption: Legacy - - tutorial/index - how_to/legacy_index - dev/tutorial/index - dev/how_to/how_to.rst - reference/langref/index - .. toctree:: :maxdepth: 1 :caption: About diff --git a/docs/install/index.rst b/docs/install/index.rst index 6bc2da97e119..b09ddb35dd45 100644 --- a/docs/install/index.rst +++ b/docs/install/index.rst @@ -29,8 +29,7 @@ Installing TVM Visit the :ref:`install TVM from source ` page to install TVM from the source code. Installing from source gives you the maximum flexibility to configure the build effectively from the official source releases. If you are interested in deploying to mobile or embedded devices, you do not need to -install the entire TVM stack on your device. Instead, you only need the runtime and can install using the -:ref:`deployment and integration guide `. +install the entire TVM stack on your device. Instead, you only need the runtime. If you would like to quickly try out TVM or run some demo and tutorials, you can :ref:`install from Docker `. You can also use TVM locally through ``pip``. diff --git a/docs/reference/api/python/contrib.rst b/docs/reference/api/python/contrib.rst index 26b5abb97ffa..0eb3024c2d08 100644 --- a/docs/reference/api/python/contrib.rst +++ b/docs/reference/api/python/contrib.rst @@ -93,18 +93,6 @@ tvm.contrib.random :members: -tvm.contrib.relay_viz -~~~~~~~~~~~~~~~~~~~~~ -.. automodule:: tvm.contrib.relay_viz - :members: -.. automodule:: tvm.contrib.relay_viz.dot - :members: -.. automodule:: tvm.contrib.relay_viz.terminal - :members: -.. automodule:: tvm.contrib.relay_viz.interface - :members: - - tvm.contrib.rocblas ~~~~~~~~~~~~~~~~~~~ .. automodule:: tvm.contrib.rocblas diff --git a/docs/reference/langref/hybrid_script.rst b/docs/reference/langref/hybrid_script.rst deleted file mode 100644 index eeed07a038cc..000000000000 --- a/docs/reference/langref/hybrid_script.rst +++ /dev/null @@ -1,237 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. 
_hybrid-langref-label: - -Hybrid Frontend Language Reference -================================== - -Overview --------- - -This hybrid frontend allows users to write preliminary versions of some idioms that yet have -been supported by TVM officially. - -Features --------- - -Software Emulation -~~~~~~~~~~~~~~~~~~ - -Both software emulation and compilation are supported. To define a function, -you need to use ``tvm.te.hybrid.script`` decorator to indicate this is a hybrid function: - -.. code-block:: python - - @tvm.te.hybrid.script - def outer_product(a, b): - c = output_tensor((100, 99), 'float32') - for i in range(a.shape[0]): - for j in range(b.shape[0]): - c[i, j] = a[i] * b[j] - return c - a = numpy.random.randn(100) - b = numpy.random.randn(99) - c = outer_product(a, b) - - -This decorator will import `Keywords`_ required spontaneously when software emulation. -After software emulation is done, the imported keywords will be cleaned up. Users do not need -worry about keyword conflict and pollution. - -Every element passed for software emulation in the argument list is either a python variable -or ``numpy`` numeric type. - -Backend Compilation -~~~~~~~~~~~~~~~~~~~ - -This function is not encouraged to use, users are encouraged to use the second interface. -The current parse interface looks like: - -.. code-block:: python - - a = tvm.te.placeholder((100, ), name='a') - b = tvm.te.placeholder((99, ), name='b') - parser = tvm.hybrid.parse(outer_product, [a, b]) # return the parser of this function - - -If we pass these tvm data structures, like ``Tensor``, ``Var``, ``Expr.*Imm``, -or ``tvm.container.Array``, to this function, it returns a op node: - -.. code-block:: python - - a = tvm.te.placeholder((100, ), name='a') - b = tvm.te.placeholder((99, ), name='b') - c = outer_product(a, b) # return the output tensor(s) of the operator - -You can use any methods that can be applied on a TVM ``OpNode``, like create_schedule, although -so far, the functionality of schedule is as limited as ``ExternOpNode``. At least, it can be built -to LLVM module. - -Tuning -~~~~~~ - -Follow up the example above, you can use some tvm like interfaces to tune the code: - -.. code-block:: python - - i, j = c.op.axis - sch = te.create_schedule(op) - jo, ji = sch.split(j, 4) - sch.vectorize(ji) - -For now, you can use loop annotations (``unroll``, ``parallel``, ``vectorize``, and ``bind``), -loop manipulation (``split`` and ``fuse``), and ``reorder``. - -.. note:: - - This is a preliminary function, so users should be in charge of the correctness - of the functionality after tuning. Specifically, users should be careful when - fusing and reorderding imperfect loops. - -Loops -~~~~~ - -In HalideIR, loops have in total 4 types: ``serial``, ``unrolled``, ``parallel``, and ``vectorized``. - -Here we use ``range`` aka ``serial``, ``unroll``, ``parallel``, and ``vectorize``, -these **4** keywords to annotate the corresponding types of for loops. -The usage is roughly the same as Python standard ``range``. - -Besides all the loop types supported in Halide, ``const_range`` is supported for some specific conditions. -Sometimes, ``tvm.container.Array`` is desired to pass as an argument, but in TVM-HalideIR, there is no -such support that converts ``tvm.container.Array`` to an ``Expr``. Thus, a limited feature is supported. -Users can access containers by either constants or constants loops annotated. - -.. 
code-block:: python - - @tvm.te.hybrid.script - def foo(a, b): # b is a tvm.container.Array - c = output_tensor(a.shape, a.dtype) - for i in const_range(len(a)): # because you have b access, i should be explicitly annotated as const_range - c[i] = a[i] + b[i] - return c - - -Variables -~~~~~~~~~ - -All the mutable variables will be lowered to an array with size 1. -It regards the first store of a variable as its declaration. - -.. note:: - - Unlike conventional Python, in hybrid script, the declared variable - can only be used in the scope level it is declared. - - -.. note:: - - Currently, you can ONLY use basic-typed variables, i.e. the type of the - variable should be either ``float32``, or ``int32``. - -.. code-block:: python - - for i in range(5): - s = 0 # declaration, this s will be a 1-array in lowered IR - for j in range(5): - s += a[i, j] # do something with s - b[i] = s # you can still use s in this level - a[0] = s # you CANNOT use s here, even though it is allowed in conventional Python - - -Attributes -~~~~~~~~~~ - -So far, ONLY tensors' ``shape`` and ``dtype`` attribute are supported! -The ``shape`` attribute is essentially a tuple, so you MUST access it as an array. -Currently, only constant-indexed access is supported. - -.. code-block:: python - - x = a.shape[2] # OK! - for i in range(3): - for j in a.shape[i]: # BAD! i is not a constant! - # do something - - -Conditional Statement and Expression -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code-block:: python - - if condition1 and condition2 and condition3: - # do something - else: - # do something else - # Select - a = b if condition else c - -However, NO ``True`` and ``False`` keyword supported yet. - - -Math Intrinsics -~~~~~~~~~~~~~~~ - -So far, these math intrinsics, ``log``, ``exp``, ``sigmoid``, -``tanh``, ``power``, and ``popcount``, are supported. -No import is required, just as it is mentioned in `Software Emulation`_, just use it! - -Array Allocation -~~~~~~~~~~~~~~~~ - -**Under construction, this function will be supported later!** - -Use a function call ``allocation(shape, type, share/local)`` to declare an array buffer. -The basic usage is roughly the same as a normal ``numpy.array``, and you should access -high-dim array in ``a[i, j, k]`` fashion instead of ``a[i][j][k]``, -even for ``tvm.container.Array`` for compilation. - - -Thread Bind -~~~~~~~~~~~ - - -You can also do loop-thread bind by writing code like this: - -.. code-block:: python - - for tx in bind("threadIdx.x", 100): - a[tx] = b[tx] - - -Assert Statement -~~~~~~~~~~~~~~~~ - -Assert statement is supported, you can simply use it as it is in standard Python. - -.. code-block:: python - - assert cond, mesg - -.. note:: - - ``Assert`` is NOT a function call. Users are encouraged to use assert in the way - presented above --- condition followed by message. It fits both Python AST and HalideIR. 
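To tie several of these constructs together, below is a small end-to-end sketch that combines a
scalar variable, nested loops, and an assert statement, executed in software emulation mode. It is
assembled only from the features described above and is meant as an illustration rather than a
verified reference for every version of the hybrid parser; the function and variable names are
made up for this example.

.. code-block:: python

    import numpy
    import tvm

    @tvm.te.hybrid.script
    def row_sums(a, b):
        # b holds one bias value per row of a; the assert documents that contract
        assert a.shape[0] == b.shape[0], "a and b must have the same number of rows"
        c = output_tensor((a.shape[0],), 'float32')
        for i in range(a.shape[0]):
            s = 0.0                        # scalar variable, lowered to a 1-element buffer
            for j in range(a.shape[1]):
                s += a[i, j]
            c[i] = s + b[i]                # s is still visible at this loop level
        return c

    a = numpy.random.randn(4, 8).astype('float32')
    b = numpy.random.randn(4).astype('float32')
    c = row_sums(a, b)                     # software emulation: numpy arrays in, numpy array out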
- -Keywords -~~~~~~~~ -- For keywords: ``serial``, ``range``, ``unroll``, ``parallel``, ``vectorize``, ``bind``, ``const_range`` -- Math keywords: ``log``, ``exp``, ``sqrt``, ``rsqrt``, ``sigmoid``, ``tanh``, ``power``, ``popcount``, ``round``, ``ceil_div`` -- Allocate keywords: ``allocate``, ``output_tensor`` -- Data type keywords: ``uint8``, ``uint16``, ``uint32``, ``uint64``, ``int8``, ``int16``, ``int32``, ``int64``, ``float16``, ``float32``, ``float64`` -- Others: ``max_num_threads`` diff --git a/docs/reference/langref/index.rst b/docs/reference/langref/index.rst deleted file mode 100644 index dcea9fa50c3d..000000000000 --- a/docs/reference/langref/index.rst +++ /dev/null @@ -1,61 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Language Reference -================== -This document provides references to -embedded languages and IRs in the TVM stack. - -Introduction to Relay ---------------------- - -Relay is a functional, differentiable programming language -designed to be an expressive intermediate representation for machine -learning systems. Relay supports algebraic data types, closures, -control flow, and recursion, allowing it to directly represent more -complex models than computation graph-based IRs can. -Relay also includes a form of dependent typing using *type relations* -in order to handle shape analysis for operators with complex -requirements on argument shapes. - -Relay is extensible by design and makes it easy for machine learning -researchers and practitioners to develop new large-scale program -transformations and optimizations. - -The below pages describe the grammar, type system, -algebraic data types, and operators in Relay, respectively. - -.. toctree:: - :maxdepth: 2 - - relay_expr - relay_type - relay_adt - relay_op - relay_pattern - -Hybrid Script -------------- - -The below page describes the TVM hybrid script front-end, -which uses software emulation to support some constructs not -officially supported in TVM. - -.. toctree:: - :maxdepth: 2 - - hybrid_script diff --git a/docs/reference/langref/relay_adt.rst b/docs/reference/langref/relay_adt.rst deleted file mode 100644 index dab2e3e70678..000000000000 --- a/docs/reference/langref/relay_adt.rst +++ /dev/null @@ -1,533 +0,0 @@ - -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. 
Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _adt-overview: - -============================= -Algebraic Data Types in Relay -============================= - -Algebraic data types (ADTs) are a staple feature of functional programming languages, -particularly those derived from ML, because they express data structures in a -manner that is easy to reason about when writing recursive computations. -Because recursion is intended to be one of the primary mechanisms of control -flow in Relay, it is important that Relay include ADTs in order to best express -loops and other control flow structures that must be implemented using recursion. - -Defining and Matching on an ADT -=============================== - -*Note: ADTs are not presently supported in the text format. The syntax here is speculative, based on ADTs in other languages.* - -ADTs can be understood as a generalized version of :code:`enum` and :code:`struct` types -from C-like languages. Like a C :code:`struct:`, an ADT instance is a container for fields -of specified types, but the type system allows for the same type to encode different possible -groupings of fields in a systematic manner, similar to C :code:`enum` types, which are -defined using a finite set of possible values named by the user. - -Specifically, an ADT is defined as a named group of constructors, each of which is -a function that takes values of specified types as arguments and returns an instance -of the named ADT. An ADT instance simply contains the values of the arguments -passed to the constructor call used to produce it. - -An ADT value is opaque until it is *deconstructed*, allowing the arguments to the -constructor to be accessed again and used to compute new values. Because -a particular ADT can have multiple constructors with different signatures, -it is usually necessary to branch on the different possible constructors, -resulting in the *match* syntax for ADTs. Hence, ADTs are sometimes called -"tagged unions" because an ADT instance is tagged by the name of the constructor -used to produce it and can later be inspected based on the tag. - -Because each ADT has a finite set of constructors, it is straightforward to determine -whether a function processing an ADT instance is handling all possible cases. -In particular, the type system can ensure that types are properly assigned in all cases when -deconstructing an ADT instance, in contrast to :code:`union` types in C. -Hence, it is often easy to reason about ADTs. - -*Implementation detail: Relay ADT definitions are global and are stored in the module, similarly to global function definitions. An ADT name is, in fact, a global type variable (just as a global function name is a global variable). The module keeps a mapping of ADT names (global type variables) to the list of constructors for that ADT.* - -Below is a simple example of defining an ADT and using it in a function -via a match expression: - -.. 
code-block:: - - # Defines an ADT named "Numbers" - data Numbers { - Empty : () -> Numbers - Single : (Tensor[(), int32]) -> Numbers - Pair : (Tensor[(), int32], Tensor[(), int32]) -> Numbers - } - # A Numbers value can be produced using an Empty, Single, or Pair - # constructor, each with a signature given above - - def @sum(%n : Numbers[]) -> Tensor[(), int32] { - # The match expression branches on the constructor that was - # used to produce %n. The variables in each case are bound - # if the constructor matches that used for %n - match(%n) { - case Empty() { 0 } - case Single(x) { x } - case Pair(x, y) { x + y } - } - } - - @sum(Empty()) # evaluates to 0 - @sum(Single(3)) # evaluates to 3 - @sum(Pair(5, 6)) # evaluates to 11 - -Note that ADTs are identified by name, -meaning that two ADTs with structurally identical constructors -will nevertheless be distinct data types from the point of view of -the typechecker. - -.. code-block:: - - # structurally identical constructors to Numbers - data Numbers2 { - Empty2 : () -> Numbers2 - Single2 : (Tensor[(), int32]) -> Numbers2 - Pair2 : (Tensor[(), int32], Tensor[(), int32]) -> Numbers2 - } - - # the below results in a type error because Numbers2 - # is a distinct type from Numbers - # fn() { @sum(Empty2()) } - -Type-Checking ADTs and Polymorphism -=================================== - -This section will go into more specific detail about the typing of ADTs. -Most of the complexity involved results from the fact that, as with functions, ADTs -can be polymorphic and take type parameters. - -For example, one of the standard ADTs commonly used in functional -programming languages is the optional type, defined here: - -.. code-block:: - - # a is a type parameter - data Optional { - None : () -> Optional - Some : (a) -> Optional - } - -Optional types are commonly used as the return type for any operation -involving querying into a data structure (returning :code:`Some(v)` -if a value is found and :code:`None` if it isn't). -Taking a type parameter in the definition allows the same optional type -to be used in a wide variety of situations, rather than having to -define a unique ADT for each different type that could be contained in it. - -However, it is important to ensure that option types whose contents -are of different types can still be distinguished by the type system, -since it would violate type safety if a function expecting an option -containing a :code:`Tensor[(), int32]` instead receives an option -containing a :code:`Tensor[(3, 4), float32]`. As this example may -imply, an ADT instance is thus given a type that contains the -concrete type arguments for that instance, ensuring the information is -kept around. Let the below example illustrate: - -.. 
code-block:: - - # the signature for option indicates the type argument - def @inc_scalar(%opt : Optional[Tensor[(), int32]]) -> Tensor[(), int32] { - match(%opt) { - case None() { 1 } - case Some(%s) { %s + 1 } - } - } - - def @main() { - let %one : Optional[Tensor[(), int32]] = Some(1); - let %big : Optional[Tensor[(10, 10), float32]] - = Some(Constant(1, (10, 10), float32)); - let %two = inc_scalar(%one); - # let %bigger = inc_scalar(%big); # type system rejects - # None does not take an argument so it can always implicitly - # be given the correct type arguments - let %z = inc_scalar(None()); - () - } - -The syntax for the annotated type arguments -(e.g., :code:`Optional[Tensor[(), int32]]`) in the above examples is -called a "type call," treating the polymorphic ADT definition as a -type-level function (taking type params and returning a type, namely -the ADT). Any ADT appearing in a type annotation or function signature -must be annotated with type arguments (a non-polymorphic ADT must be -in a type call with no arguments). - -Thus, we can say in general that if constructor :code:`C` that -takes arguments of types :code:`T1, ..., Tn` is a constructor -for an ADT :code:`D` that takes type parameters :code:`v1, ..., vn` -(where :code:`T1, ..., Tn` may contain any of the :code:`v1, ..., vn`), -then :code:`C` has -the type :code:`fun(T1, ..., Tn) -> D[v1, ..., vn]`. -This means that constructors are typed like ordinary functions and -thus appear inside call nodes and can be passed to or returned by -other functions. In particular, the :code:`Some` example above has -the signature :code:`fun(a) -> Optional[a]`, while :code:`None` -has the signature :code:`fun() -> Optional[a]`. - -Recursion with ADTs -=================== - -ADT definitions are allowed to be recursive, that is, a definition for -an ADT named :code:`D` can assume the existence of type :code:`D` and -use it as an argument to constructors. Recursion allows ADTs to -represent complex structures such as lists or trees; it is the source -of much of ADTs' power in functional programming, since an appropriately -designed data structure could make it easy to concisely express a -computation with a recursive function. - -Many commonly used ADTs involve recursion; some of these are given -in `Common ADT Uses`_. As an example here, we will -examine the list ADT, ubiquitous in functional languages: - -.. code-block:: - - data List { - Nil : () -> List - Cons : (a, List[a]) -> List - } - -(Notice that the recursive reference to :code:`List` is wrapped -in a type call even in the constructor.) - -The above definition means that a list of values of a particular type -can be represented by nesting :code:`Cons` constructors until the -end of the list is reached, which can be indicated with a :code:`Nil` -(representing an empty list). - -Lists represented in this manner can easily be recursively processed. -For example, the following function sums a list of integers: - -.. code-block:: - - def @list_sum(%l : List[Tensor[(), int32]]) -> Tensor[(), int32] { - match(%l) { - case Nil() { 0 } - # add the head of the list to the sum of the tail - case Cons(%h, %t) { %h + @list_sum(%t) } - } - } - -As it happens, many recursive functions on lists like the one just given -share structures that can be factored out into generic, easily -usable functions that will be discussed under `Common ADT Uses`_. - -.. 
_adt-pattern: - -Pattern Matching in Match Expressions -===================================== - -Match expressions in Relay, as in other functional languages, are capable of -more versatile pattern matching than simply having one case for each constructor -for the datatype of the value being deconstructed. - -In particular, the patterns in match cases can be built up recursively: - -- Constructor patterns match for a particular ADT constructor. If a value matches the constructor, each argument to the constructor will be matched against a nested pattern. -- Wildcard patterns will match any value and will not bind to a variable. -- Variable patterns will match any value and bind it to a local variable, scoped to the match clause. - -In the simple case of :code:`@list_sum` above, the first match case has a :code:`Nil` constructor pattern (with no nested arguments) -and the second has a :code:`Cons` constructor pattern that uses variable patterns for each of the arguments to :code:`Cons`. - -The below example uses a wildcard pattern to ignore one of the arguments to :code:`Cons`: - -.. code-block:: - - def @first(%l : List[a]) -> Optional[a] { - match(%l) { - case Nil() { None() } - case Cons(%h, _) { Some(%h) } # list tail is unused and ignored - } - } - -Here, a constructor pattern is nested inside another constructor pattern to avoid nested match expressions for a list option. -A top-level wildcard pattern is also used to handle all cases that do not match the first clause: - -.. code-block:: - - def @second_opt(%ll : Optional[List[a]]) -> Optional[a] { - match(%ll) { - # we only need the second member of the list if there is one - case Some(Cons(_, Cons(%s, _))) { Some(%s) } - case _ { None() } - } - } - - # @second_opt(Some(Cons(1, Nil()))) evaluates to None() - # @second_opt(Some(Cons(1, Cons(2, Nil())))) evaluates to Some(2) - # @second_opt(Some(Nil())) evaluates to None() - # @second_opt(None()) evaluates to None() - -Note that a match expression checks its patterns in the order the cases are listed: the first clause whose pattern -that matches the input value is the one that is evaluated. Here, a top-level variable pattern binds the whole -input value: - -.. code-block:: - - def @match_order_beware(%l : List[a]) -> List[a] { - match(%l) { - case %v { %v } - # the above matches everything so neither of these runs - case Cons(%h, %t) { Cons(%h, @match_order_beware(%t)) } - case Nil() { Nil() } - } - } - -Common ADT Uses -=============== - -In functional programming languages, certain ADTs provide useful facilities for writing common programs. -Parametric polymorphism and higher-order functions allow these ADTs to be easily reuseable and for generic -functions to manipulate them in common situations. Relay includes a "Prelude" of certain pre-defined ADTs -and functions for them that correspond to the indispensable ADTs of other languages. - -The option type defined under `Type-Checking ADTs and Polymorphism`_ is one such ADT, used -whenever it can make sense for a function to only return a value under certain circumstances. Having -the option type allows for the type system to keep track of which functions always return a value -of a certain type versus returning an option of that type, ensuring that any options are always -explicitly checked (contrast with returning null pointers or throwing -exceptions as other ways to addressing that problem). 
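-
-For reference, the Prelude can also be loaded from the Python API. The following is a minimal
-sketch (assuming a standard TVM build where the Prelude is exposed as
-:code:`tvm.relay.prelude.Prelude`); it registers the Prelude's ADTs and helper functions into a
-fresh module and lists the resulting global definitions:
-
-.. code-block:: python
-
-    import tvm
-    from tvm.relay.prelude import Prelude
-
-    mod = tvm.IRModule()
-    # Registers the List and Option ADTs along with helpers such as map and the folds.
-    prelude = Prelude(mod)
-
-    # Inspect the global definitions the Prelude added to the module.
-    print(sorted(gv.name_hint for gv in mod.get_global_vars()))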
- -Lists (defined in `Recursion with ADTs`_) can be manipulated by generic functions in a manner similar to -list comprehensions and certain library functions in Python. Below are very common functions for iterating -through lists, which are included in Relay's Prelude. (These have all been extensively characterized -in the functional programming literature, and we do not attempt to reproduce that work in this document.) - -.. code-block:: - - # Map: for [h1, h2, ..., hn] returns [f(h1), f(h2), ..., f(hn)] - def @map(%f : fn(a) -> b, %l : List[a]) -> List[b] { - match(%l) { - case Nil() { Nil() } - case Cons(%h, %t) { Cons(%f(%h), @map(%f, %t)) } - } - } - - # Left fold: for [h1, h2, ..., hn] returns f(...(f(f(z, h1), h2)...), hn) - def @foldl(%f : fn(b, a) -> b, %z : b, %l : List[a]) -> b { - match(%l) { - case Nil() { %z } - case Cons(%h, %t) { @foldl(%f, %f(%z, %h), %t) } - } - } - - # Right fold: for [h1, h2, ..., hn] returns f(h1, f(h2, f(..., (f(hn, z)...) - def @foldr(%f : fn(a, b) -> b, %z : b, %l : List[a] -> b { - match(%l) { - case Nil() { %z } - case Cons(%h, %t) { %f(%h, @foldr(%f, %z, %t)) } - } - } - -Using these iteration constructs, many common operations over lists can be expressed compactly. -For example, the following map doubles all members of a list: - -.. code-block:: - - # directly written - def @double(%l : List[Tensor[(), int32]]) -> List[Tensor[(), int32]] { - match(%l) { - case Nil() { Nil() } - case Cons(%h, %t) { Cons(%h * 2, @double(%t)) } - } - } - - # map takes care of the recursion - @map(fn(%i) { %i * 2 }, %l) - -The following right fold concatenates two lists: - -.. code-block:: - - # directly written - def @concat(%l1 : List[a], %l2 : List[a]) -> List[a] { - match(%l1) { - case Nil() { %l2 } - case Cons(%h, %t) { Cons(%h, @concat(%t, %l2) } - } - } - - # foldr takes care of the recursion - @foldr(fn(%h, %z) { Cons(%h, %z) }, %l2, %l1) - -The following left fold flattens a list of lists (using concatenation): - -.. code-block:: - - # directly written - def @flatten(%ll : List[List[a]]) -> List[a] { - match(%ll) { - case Cons(%h, %t) { @concat(%h, @flatten(%t)) } - case Nil() { Nil() } - } - - # foldl takes care of the recursion - @foldl(@concat, Nil(), %ll) - -Note that these iteration constructs can be implemented directly in Relay's -source language and more can easily be defined (and for more data types, like trees), -rather than being constructs built into the language (e.g., -`"foreach" in MXNet `__). -ADTs and their extensibility allow for a broad range of iterations and data structures to be expressed -in Relay and supported by the type system without having to modify the language implementation. - -Implementing Neural Nets Using ADTs -=================================== - -In `this 2015 blog post `__, Christopher Olah notes that -many neural networks can be easily expressed using common functional programming constructs. Relay's ADTs -allow those examples to be implemented directly in TVM. - -First let us suppose that we have a function corresponding to a trained recurrent neural net (RNN) -cell, which takes in a past state and an input value and returns a new state and output value. In -Relay, this would have the following signature: - -.. code-block:: - - @cell : fn(state_type, in_type) -> (state_type, out_type) - -We might consider a ReLU cell as a simple concrete example, with a trained version below: - -.. 
code-block:: - - def @linear(%x, %w, %b) { %w*%x + %b } - - def @relu_cell(%w, # weights - %b, # offsets - %s, # state - %x # input - ) { - let %x2 = @linear(%x, %w.0, %b.0); - let %s2 = @linear(%s, %w.1, %b.1); - # doesn't change the state - (%s, nn.relu(%x2 + %s2)) - } - - # this is a higher-order function because it returns a closure - def @trained_cell(%w, %b) { - fn(%x, %h) { @relu_cell(%w, %b, %x, %h) } - } - -Following Olah's example, we can encode a sequence (list) of inputs with the following left fold: - -.. code-block:: - - def @encode(%cell, %input : List[in_type], %init : state_type) -> state_type { - # not using the output - @foldl(fn(%state, %in) { %cell(%state, %in).0 }, %init, %input) - } - -Using an *unfold* iterator (from Haskell's standard library), the same cell could be used to make -a generator network (which takes a single input and produces a sequence of outputs): - -.. code-block:: - - # included in Relay's Prelude - def @unfoldr(%f : fn(b) -> Optional[(a, b)], %z : b) -> List[a] { - match(%f(%z)) { - case Some(%pair) { Cons(%pair.0, @unfoldr(%f, %pair.1)) } - case None() { Nil() } - } - } - - # we need some way of generating an input to the cell function given only a state - def @gen_func(%state : state_type) : Optional[(out_type, state_type)] { - let %in : Optional[in_type] = @generate_input(%state); - match(%in) { - case Some(%n) { - let %cell_out = @cell(%n, %state); - Some((%cell_out.1, %cell_out.0)) # pair of output and state - } - case None() { None() } - } - } - - def @generator(%cell, %init : state_type) -> List[out_type] { - @unfoldr(fn(%state) { @gen_func(%cell, %state) }, %init) - } - -An accumulating map (a fold that simultaneously updates an accumulator value and a list -of outputs) can be used to write a general RNN (with an output for every input): - -.. code-block:: - - def @map_accumr(%f : fn(a, b) -> (a, c), %acc : a, %l : List[b]) -> (a, List[c]) { - match(%l) { - case Nil() { (%acc, Nil()) } - case Cons(%b, %t) { - let %update = %f(%acc, %b); - let %rest = @map_accumr(%f, %update.0, %t)); - (%rest.0, Cons(%update.1, %rest.1)) - } - } - } - - # can also be implemented as a right fold - # (this version is included in Relay's Prelude) - def @map_accumr_fold(%f, %acc, %l) { - @foldr(fn(%b, %p) { - let %f_out = %f(%p.0, %b); - (%f_out.0, Cons(%f_out.1, %p.1)) - }, - (%acc, Nil()), %l) - } - - def @general_rnn(%cell, %init : state_type, %input : List[in_type]) - -> (state_type, List[out_type]) { - @map_accumr(%cell, %init, %input) - } - -Olah also gives an example of a bidirectional neural network, in which two sets of -cells (which may have different weights) process the input in both directions and produce a -single set of outputs. The following is a Relay implementation of that example: - -.. 
code-block:: - - # creates a list of tuples from two lists - # included in Relay's Prelude - def @zip(%l : List[a], %m : List[b]) -> List[(a, b)] { - match(%l) { - case Nil() { Nil() } - case Cons(%a, %t1) { - match(%m) { - case Nil() { Nil() } - case Cons(%b, %t2) { Cons((%a, %b), @zip(%t1, %t2)) } - } - } - } - } - - # analogous to map_accumr - # included in Relay's Prelude - def @map_accmul(%f, %acc, %l) { - @foldl(fn(%p, %b){ - let %f_out = %f(%p.0, %b); - (%f_out.0, Cons(%f_out.1, %p.1)) - }, (%acc, Nil()), %l) - } - - def @bidirectional_rnn - (%cell1, %cell2, %state1 : state1_type, %state2 : state2_type, %input : List[in_type]) - -> List[(out1_type, out2_type)] { - @zip(@map_accumr(%cell1, %state1, %input).1, @map_accuml(%cell2, %state2, %input).1) - } diff --git a/docs/reference/langref/relay_expr.rst b/docs/reference/langref/relay_expr.rst deleted file mode 100644 index c789331efe63..000000000000 --- a/docs/reference/langref/relay_expr.rst +++ /dev/null @@ -1,691 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -==================== -Expressions in Relay -==================== - -The Relay IR is a pure, expression-oriented language. The below sections -describe the different expressions in Relay and give details of their semantics. - -Dataflow and Control Fragments -============================== - -For the purposes of comparing Relay to traditional computational graph-based IRs, it -can be useful to consider Relay expressions in terms of dataflow and control fragments. -Each portion of a Relay program containing expressions that only affect the dataflow can -be viewed as a traditional computation graph when writing and expressing transformations. - -The dataflow fragment covers the set of Relay expressions that do not involve -control flow. That is, any portion of a program containing only the following -constructs corresponds to a pure computation graph: - -- `Variables`_ -- Tuple `Construction`_ and `Projection`_ -- `Let Bindings`_ -- `Graph Bindings`_ -- Calls to `Operators`_ and `ADT Constructors`_ - -Control flow expressions allow the graph topology to change -based on the value of previously executed expressions. The control -fragment in Relay includes the following constructs: - -- `If-Then-Else`_ Expressions -- `ADT Matching`_ Expressions -- Recursive Calls in Functions - -From the point of view of a computation graph, a function is a subgraph and a function call inlines the subgraph, substituting its arguments for the free variables in the subgraph with corresponding names. -Thus, if a function's body uses only dataflow constructs, -a call to that function is in the dataflow fragment; conversely, if the -function's body contains control flow, a call to that function is not part of the dataflow fragment. 
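-
-To make the distinction concrete, the following Python sketch (using only standard
-:code:`tvm.relay` expression constructors; the variable names are illustrative) builds a pure
-dataflow fragment and then a variant that leaves the dataflow fragment by introducing control flow:
-
-.. code-block:: python
-
-    from tvm import relay
-
-    x = relay.var("x", shape=(2, 2), dtype="float32")
-    y = relay.var("y", shape=(2, 2), dtype="float32")
-
-    # Pure dataflow fragment: equivalent to a static computation graph.
-    dataflow = relay.add(relay.multiply(x, y), y)
-
-    # Wrapping the result in an If expression introduces control flow,
-    # so this function is no longer part of the dataflow fragment.
-    cond = relay.var("c", shape=(), dtype="bool")
-    branched = relay.Function(
-        [x, y, cond], relay.If(cond, dataflow, relay.subtract(x, y))
-    )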
- -Variables -========= - -Inspired by LLVM, Relay explicitly distinguishes between local and -global variables both in the AST and in the text format. In the text format, -global and local variables are distinguished by prefixes, or *sigils*. -Global variables are prefixed with :code:`@` and local variables with :code:`%`. - -This explicit distinction makes certain optimizations easier to implement. -For example, inlining a global definition requires no analysis: simply -substituting the definition suffices. - -Global Variable -~~~~~~~~~~~~~~~~~~ - -Global identifiers are prefixed by the :code:`@` sigil, such as ":code:`@global`". -A global identifier always references a globally visible definition contained in the -globally visible environment, known as the `module `__. -Global identifiers must be unique. - -See :py:class:`~tvm.relay.expr.GlobalVar` for its implementation -and documentation. - -Local Variable -~~~~~~~~~~~~~~ - -Local identifiers are prefixed by the :code:`%` sigil, -such as ":code:`%local`". A local identifier always references -a function argument or a variable bound in a :code:`let` expression, -and will be scoped to the function where it appears or the :code:`let` -expression where it is bound, respectively. - -In the below code segment, notice that :code:`%a` is defined twice. This is -permitted, as in most functional languages; in the scope of the second -:code:`let` expression, the name :code:`%a` is "shadowed," meaning all -references to :code:`%a` in the inner scope refer to the later definition, while -references to :code:`%a` in the outer scope continue to refer to -the first one. - -.. code-block:: - - let %a = 1; - let %b = 2 * %a; // %b = 2 - let %a = %a + %a; // %a = 2. %a is shadowed - %a + %b // has value 2 + 2 = 4 - -(Note that in Relay's implementation, each definition of a local variable -creates a new :py:class:`~tvm.relay.expr.Var`, so a shadowed local variable, -despite having the same name as one in an outer scope, will be a different -object. This allows for comparing local variables by pointer identity with the -knowledge that the same local variable object corresponds to a different binding site.) - -See :py:class:`~tvm.relay.expr.Var` for its implementation -and documentation. - -Functions -========= - -Functions in Relay act similarly to procedures or functions in -other programming languages and serve to generalize the concept -of a named subgraph. - -Functions are first class in Relay, which means they are expressions just like variables, constants, and tuples. -Additionally, functions in Relay are higher-order, which means that a function can be passed as an argument to a -function or returned by a function, as function expressions evaluate to closures (see the `Closures`_ subsection), -which are values like tensors and tuples. - -See :py:class:`~tvm.relay.function.Function` for the definition and documentation of function nodes. - -Syntax -~~~~~~ - -A definition minimally consists of the keyword :code:`fn`, an empty set of -parameters, and a body expression (:py:class:`~tvm.relay.expr.Expr`) -contained by curly braces. - -.. code-block:: - - fn() { body } - -A definition may contain any number of parameters. For example, a -simple function that invokes the :code:`add` operator: - -.. code-block:: - - fn(%x, %y) { add(%x, %y) } - -Notice that within the function's body, the parameters are local -variables, just like those bound in a :code:`let` expression. - -One may also annotate explicit types on functions. 
-For example, we can restrict the above function to only work -on certain types: - -.. code-block:: - - fn(%x : Tensor[(10, 10), float32], %y : Tensor[(10, 10), float32]) - -> Tensor[(10, 10), float32] { - add(%x, %y) - } - -The above function only takes arguments of type :code:`Tensor[(10, 10), float32]` and returns a value of -type :code:`Tensor[(10, 10), float32]`. A function parameter is just a local -variable (:py:class:`~tvm.relay.expr.LocalVar`) optionally annotated with a type, written as :code:`%x : T`. - -When the type information is omitted, Relay attempts to infer the most general type -for the users. This property is known as generalization: for a definition without -explicit annotations, Relay attempts to assign the most general type to the -parameters and return type based on the function body and call sites. - -A recursive function expression can be defined using a :code:`let` binding, -as here: - -.. code-block:: - - let %fact = fn(%x : Tensor[(10, 10), float32]) -> Tensor[(10, 10), float32] { - if (%x == Constant(0, (10, 10), float32)) { - Constant(1, (10, 10), float32) - } else { - %x * %fact(%x - Constant(1, (10, 10), float32)) - } - }; - %fact(Constant(10, (10, 10), float32)) - -Closures -~~~~~~~~ - -A function expression evaluates to a closure. Closures -are values that are represented as a pair of a local environment -(storing the values for all variables defined outside the scope -of the function's body) and the function itself. - -For example, in the below example, the final result will be -a tensor of zero values because the closure for :code:`%f` stores the value of -:code:`%x` at the pointer where :code:`%f` was defined. - -.. code-block:: - - let %g = fn() { - let %x = Constant(0, (10, 10), float32); - // %x is a free variable in the below function - fn(%y) { %y * %x } - }; - // the %x in %g's body is not in scope anymore - // %f is a closure where %x maps to Constant(0, (10, 10), float32) - let %f = %g(); - let %x = Constant(1, (10, 10), float32); - %f(%x) // evaluates to Constant(0, (10, 10), float32) - -Polymorphism and Type Relations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -*Note: type parameter syntax is not yet supported in the text format.* - -A function may also be given a set of type parameters, which can be -substituted for specific types at call sites. Functions with -type parameters are *type polymorphic*; their return type or the types -of arguments they will accept can vary based on the type arguments -given at call sites. - -Type parameters are classified by *kind* and can -only appear in parts of the type signature where their kind is appropriate -(e.g., type parameters of kind :code:`Shape` can only appear where a shape -would be expected in a tensor type); for a full discussion, -see :ref:`the documentation on type parameters `. - -For example, one can define a polymorphic identity function for -any Relay type as follows: - -.. code-block:: - - fn(%x : t) -> t { - %x - } - -The below definition is also polymorphic, but restricts its -arguments to tensor types: - -.. code-block:: - - fn(%x : Tensor[s, bt]) { - %x - } - -Notice that the return type is omitted and will be inferred. - -*Note: "where" syntax is not yet supported in the text format.* - -A function may also be subject to one or more type relations, such as in -the following: - -.. 
code-block:: - - fn(%x, %y) where Broadcast { add(%x, %y) } - -In the above definition, the types of :code:`%x` and :code:`%y` and the return type -are subject to the :code:`Broadcast` relation, meaning all three must be tensors -and their shapes follow the elementwise broadcast relation. As with -operators, the definitions of relations are not transparent to Relay -and they are instead implemented externally in either C++ or Python. - -As in the case of :code:`Broadcast`, relations are used to express complicated -constraints on types (especially tensor shapes). -All function relations must hold at all call sites; -type checking is thus treated as a constraint-solving problem. -For more detail on type relations and their implementations, -please see :ref:`their section in the documentation on Relay's type system `. - -Operators -========= - -An operator is a primitive operation, such as :code:`add` or :code:`conv2d`, not defined in the Relay -language. Operators are declared in the global operator -registry in C++. Many common operators are backed by TVM's -Tensor Operator Inventory. - -To register an operator a user must provide an implementation -of the operator, its type, and any other desired metadata. -The operator registry is a column-based store where -operators are keys, so any metadata (which might be referenced -by optimization passes) may be registered as a new column. - -From the perspective of Relay's type system, an operator is a function, -so operators may be called like any other function and have function -types. In particular, operator types are registered using a single -type relation (see :ref:`the documentation on type relations `), typically a relation -specialized to that operator. For example, the :code:`add` operator -is registered with the :code:`Broadcast` relation, indicating that the -arguments of :code:`add` must be tensors and that the return type -is a tensor whose shape depends on those of its arguments. - -Operators are rendered without a sigil (e.g :code:`conv2d`, :code:`flatten`) -when pretty-printing Relay programs. -Operators are explicitly contained in the program and are uniquely -identifiable by pointer. - -Note that common arithmetic operators such as :code:`add` and :code:`multiply` -may be written using the corresponding arithmetic operators in the text format -(e.g., :code:`+` or :code:`*`) as syntactic sugar. - -See :py:class:`~tvm.relay.op.Op` for the definition and documentation -of operator nodes, demonstrating the infrastructure for registering -operator metadata. The other files in :py:class:`~tvm.relay.op` give -handles for generating a call to various pre-registered operators. -The :ref:`tutorial on adding operators to Relay ` shows how to add further -operators into the language. - -ADT Constructors -================ - -Algebraic data types (ADTs) in Relay are described in detail in a -:ref:`separate overview` and their integration into -the type system is described :ref:`here`. - -In this section, we will simply note that ADT constructors are given -a function type and should be used inside call nodes like a function -or operator. An ADT constructor is defined by giving the name of -the ADT it constructs (a global type variable) and the types of the -expected arguments for the constructor. - -If the ADT definition includes type variables, those type variables -may appear in the constructor. Constructors cannot include any other -type variables. 
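-
-The Python API reflects this view of constructors as callable expressions. The snippet below is an
-illustrative sketch only: the :code:`Numbers`-like ADT is hypothetical, and registering its
-definition on the module via item assignment is an assumption about how
-:py:class:`~tvm.relay.adt.TypeData` definitions are typically installed, not a canonical recipe.
-
-.. code-block:: python
-
-    import tvm
-    from tvm import relay
-
-    mod = tvm.IRModule()
-
-    # A hypothetical two-constructor ADT mirroring the earlier "Numbers" example.
-    numbers = relay.GlobalTypeVar("Numbers")
-    empty = relay.adt.Constructor("Empty", [], numbers)
-    single = relay.adt.Constructor("Single", [relay.TensorType((), "int32")], numbers)
-    # Assumed registration path: install the type definition on the module.
-    mod[numbers] = relay.adt.TypeData(numbers, [], [empty, single])
-
-    # A constructor is an expression with a function type, so it is invoked like
-    # any other callable and evaluates to an ADT instance.
-    three = single(relay.const(3, "int32"))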
- -Let us suppose that :code:`D` is an ADT that takes type parameters -:code:`a` and :code:`b`. If :code:`C1` is a constructor for :code:`D` -and expects two arguments, one of type :code:`a` and one of type :code:`b`, then -:code:`C1` has the following type signature: -:code:`fun(a, b) -> D[a, b]`. (See either the ADT overview -or the discussion of ADT typing for an explanation of the type call -in the return type.) -If another constructor for :code:`D`, :code:`C2`, takes no arguments, -then it has the following type signature: :code:`fun() -> D[a, b]`; -the type parameters will always appear in the return type. - -Once called, a constructor produces an ADT instance, which is a -container that stores the values of the arguments to the constructor -as well as the name ("tag") of the constructor. The tag will be used -for deconstructing the instances and retrieving the values when -`ADT Matching`_. - -See :py:class:`~tvm.relay.adt.Constructor` for the definition and documentation. - -Call -==== - -Expressions with function types in Relay are "callable," -meaning that they can be invoked via a function call. These consist of -any expression that evaluates to a closure (i.e., function expressions -or global functions) and Relay operators. - -The syntax of calls follows that used in C-like languages, demonstrated in the -example below: - -.. code-block:: - - let %c = 1; - let %f = fn(%x : Tensor[(), float32], %y : Tensor[(), float32]) { %x + %y + %c }; - %f(10, 11) - -When a closure is called (see `Closures`_), -the closure's body is evaluated in the stored environment -(i.e., using the stored values for free variables) with -local variable bindings added for each argument; the final value -obtained by evaluating the body is the call's return value. -Thus, in the above example, the call evaluates to 22. -In the case of operators, the implementation is opaque to Relay, -so the result is left up to the registered TVM implementation. - -*Note: type parameters are not yet supported in the text format.* - -A type-polymorphic function can also include type arguments at a call -site. The type arguments are substituted for type parameters when -type checking. If a function is type-polymorphic and type arguments are not -given, type inference will attempt to infer type arguments if possible. -The following code gives examples of explicit and inferred type arguments: - -.. code-block:: - - // %f : fn(a, b) -> c - let %x1 = %f(True, False); - // %x1 is of type Tensor[(), bool] - let %x2 : () = %f(%x1, %x1) - // the type arguments in the second call are inferred to be - -Note that all type relations in the function type must hold at each -call site. Specifically, this means that the relation will be checked -against the specific types of the arguments at a given call site. This -is also a form of polymorphism, since there may be multiple valid -assignments of argument types and a return type so long as the relation -is satisfied. - -For example, if we have a function :code:`%f` that takes tensor arguments -and has the :code:`Broadcast` relation, then there are many different -shapes that the arguments in the below call could have that would satisfy -the type annotation: - -.. code-block:: - - let %x : Tensor[(100, 100, 100), float32] = %f(%a, %b); - %x - -See :py:class:`~tvm.relay.expr.Call` for its definition and documentation. - -.. 
_module-description: - -Module and Global Functions -=========================== - -Relay keeps a global data structure known as a "module" (often called an "environment" in other -functional programming languages) to keep track of the definitions of global functions. -In particular, the module keeps a globally accessible mapping of global variables to the -function expressions they denote. The utility of the module is that it allows global functions -to recursively refer to themselves or any other global function (e.g., as in mutual recursion). - -Note Relay's module is analogous to data structures for keeping track of subgraphs in computation -graph-based IRs. - -Global functions in Relay behave identically to the function expressions defined in `Functions`_, -but have syntactic sugar in the text format to enter their definitions into the module. Namely, -a global function definition includes a global identifier and is allowed to recursively refer to -that identifier in the body, as in the following example: - -.. code-block:: - - def @ackermann(%m : Tensor[(), int32], %n : Tensor[(), int32]) -> Tensor[(), int32] { - if (%m == 0) { - %n + 1 - } else if (%m > 0 && %n == 0) { - @ackermann(%m - 1, 1) - } else { - @ackermann(%m - 1, @ackermann(%m, %n - 1)) - } - } - -This definition would result in a module entry mapping the identifier :code:`@ackermann` to a function expression -with the parameters, return type, and body above. Any reference to the identifier :code:`@ackermann` elsewhere in the -code could then look up the identifier in the module and replace the function definition as needed. - -See :py:class:`~tvm.IRModule` for the definition and documentation of a module. - -Constant -======== - -This node represents a constant tensor value -(see :py:mod:`~tvm.relay.Value` for more details). -A constant is represented as a :py:class:`~tvm.NDArray`, -allowing Relay to utilize TVM operators for constant evaluation. - -This node can also represent scalar constants, since -scalars are tensors with a shape of :code:`()`. In the text format, numerical -and boolean literals are thus syntactic sugar for constants encoding a -tensor type with a rank-zero shape. - -See :py:class:`~tvm.relay.expr.Constant` for its definition and documentation. - -Tuples -====== - -Construction -~~~~~~~~~~~~ - -The tuple node builds a finite (that is, of statically known size) sequence of heterogeneous data. -These tuples match Python's closely, and their fixed length allows for efficient projection of their -members. - -.. code-block:: - - fn(%a : Tensor[(10, 10), float32], %b : float32, %c : Tensor[(100, 100), float32]) { - let %tup = (%a, %b); // type: (Tensor[(10, 10), float32], float32) - ((%tup.0 + %tup.1), %c) // type: (Tensor[(10, 10), float32], Tensor[(100, 100), float32]) - } - -See :py:class:`~tvm.relay.expr.Tuple` for its definition and documentation. - -Projection -~~~~~~~~~~ - -A tuple must be indexed by an integer constant in order to extract a -particular member of the tuple. Projections are 0-indexed. - -For example, the below projection evaluates to :code:`%b`: - -.. code-block:: - - (%a, %b, %c).1 - -See :py:class:`~tvm.relay.expr.TupleGetItem` for its definition and documentation. - -Let Bindings -============ - -A :code:`let` binding is an immutable local variable binding, -allowing the user to bind an expression to a name. - -A :code:`let` binding contains a local variable, -an optional type annotation, a value, and a body expression -that may reference the bound identifier. 
If a type annotation -on the bound variable is omitted, Relay attempts to infer the -most general type permitted for the variable. - -The bound variable in a :code:`let` expression is only in scope -in its body, except when the variable defines a function expression. -When a :code:`let` expression creates a function, the variable is also -in scope in its value to allow for recursively defined functions -(see the previous subsection). - -The value of a :code:`let` binding is the value of the final expression -after evaluating the bindings it depends on. For example, in the -following example the entire expression evaluates to a tensor -of shape :code:`(10, 10)` where all elements are 1: - -.. code-block:: - - let %x : Tensor[(10, 10), float32] = Constant(1, (10, 10), float32); - %x + %x - -A sequence of :code:`let` bindings can be considered as a dataflow graph, -where the bindings are a series of sub-graphs connected -by bound variables. Since these binding sequences are -pure, a pair of bindings where neither depends on the other can be safely reordered. -For example, the first and second :code:`let` bindings below -may be evaluated in either order because neither has a dataflow -dependency on the other: - -.. code-block:: - - let %x = %a + %b; - let %y = %c + %d; - %x * %y - -See :py:class:`~tvm.relay.expr.Let` for its definition and documentation. - -Graph Bindings -============== - -A :code:`let` binding creates a named variable that is bound to the given value -and scoped to the subsequent expression. By contrast, a graph binding allows for -explicitly constructing dataflow graphs in a Relay program by binding an expression -(graph node) directly to a temporary variable, which is not scoped. Each reference -to the variable corresponds to an edge in the dataflow graph. This has the -semantics of substituting the expression wherever the variable appears, even though -the graph node will only be evaluated once by the compiled program. - -These bindings allow for a style of programming that corresponds to that already -employed by NNVM and other dataflow graph-based input formats. The fact that the variables -are not scoped offers some flexibility in evaluation order compared to :code:`let` -bindings, though this can also introduce some ambiguity in programs. - -*Note: Graph bindings are not currently parsed by the text format.* - -In Relay's text format, a graph binding can be written as below (note the lack of a -:code:`let` keyword and a semicolon): - -.. code-block:: - - %1 = %a + %b - %2 = %1 + %1 - %2 * %2 - -Unlike a let binding, a graph binding is not represented as an AST node in Relay, but rather as a meta-variable referencing its AST node value. -For example, a program like the above could be constructed in Relay's -Python front-end by setting *Python variables* equal to the corresponding Relay AST node and -using the variables repeatedly, as below (a C++ program using the corresponding API bindings -could accomplish the same thing): - -.. code-block:: - - sum1 = relay.add(a, b) - sum2 = relay.add(sum1, sum1) - relay.multiply(sum2, sum2) - -For development purposes and to enable certain optimizations, Relay includes passes to -convert between dataflow graphs defined using graph bindings and programs with :code:`let` -bindings in A-normal form, employed by many compiler optimizations from the functional -programming community (see `"A-Normalization: Why and How" by -Matt Might `__ for an introduction -to A-normal form). 
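-
-For reference, these conversions are exposed as Relay passes. The sketch below (assuming the
-standard :code:`relay.transform` passes :code:`ToANormalForm` and its inverse
-:code:`ToGraphNormalForm`) rewrites a graph-style program into explicit :code:`let` bindings:
-
-.. code-block:: python
-
-    import tvm
-    from tvm import relay
-
-    x = relay.var("x", shape=(2, 2))
-    y = relay.var("y", shape=(2, 2))
-    s = relay.add(x, y)          # the Python variable plays the role of a graph binding
-    out = relay.multiply(s, s)   # reusing it creates two edges to the same node
-
-    mod = tvm.IRModule.from_expr(relay.Function([x, y], out))
-    # Convert the graph-bound body into A-normal form (explicit let bindings).
-    mod = relay.transform.ToANormalForm()(mod)
-    print(mod["main"])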
- -If-Then-Else -============ - -Relay has a simple if-then-else expression that allows programs to branch -on a single value of type :code:`bool`, i.e., a zero-rank -tensor of booleans (:code:`Tensor[(), bool]`). - -.. code-block:: - - if (%t == %u) { - %t - } else { - %u - } - -Since if-then-else branches are expressions, they may appear inline -wherever any other expression may be expected, like invocations of -the ternary operator in C-like languages. The if-then-else expression -evaluates to the value of the "then" branch if the condition value -evaluates to :code:`True` and evaluates to the value of the "else" branch if the -condition value evaluates to :code:`False`. - -See :py:class:`~tvm.relay.expr.If` for its definition and documentation. - -ADT Matching -============ - -Instances of algebraic data types (ADTs), as discussed in the -:ref:`ADT overview`, are containers that store the -arguments passed to the constructor used to create them, tagged by -the constructor name. - -Match expressions in Relay allow for retrieving the values stored in -an ADT instance ("deconstructing" it) based on their constructor tag. -A match expression behaves similarly to a C-style :code:`switch` statement, -branching on the different possible constructors for the type of the -value being deconstructed. As the ADT overview details, match -expressions are capable of more general pattern-matching than simply -splitting by constructors: any ADT instance nested inside an instance -(e.g., a list of lists) can be deconstructed at the same time as -the outer instance, while the different fields of the instance can be -bound to variables. (See :ref:`this section` for a detailed -description of ADT pattern-matching.) - -A match expression is defined using the -input value (an expression) and a list of clauses, each of which -consists of a pattern and an expression. When executed, the *first* -clause whose pattern matches the structure of the queried value is -executed; the clause expression is evaluated and returned. - -For example, suppose we have an ADT for natural numbers: - -.. code-block:: - - data Nat { - Z : () -> Nat # zero - S : (Nat) -> Nat # successor (+1) to a nat - } - -Then the following function subtracts one from a passed nat: - -.. code-block:: - - fn(%v: Nat[]) -> Nat[] { - match(%v) { - case Z() { Z() } - case S(%n) { %n } # the variable %n is bound in the scope of this clause - } - } - -The following function subtracts two from its argument if it is at least -two and returns the argument otherwise, using a nested constructor pattern: - -.. code-block:: - - fn(%v : Nat[]) -> Nat[] { - match(%v) { - case S(S(%n)) { %n } - # wildcard pattern: matches all cases not matched already - case _ { %v } - } - } - -As aforementioned, the ordering of match clauses is relevant. -In the below example, the first clause will always match so -those below it can never run: - -.. code-block:: - - fn(%v : Nat[]) -> Nat[] { - match(%v) { - case _ { %v } - case S(S(%n)) { S(%n) } - case S(%n) { %n } - case Z() { S(Z()) } - } - } - -See :py:class:`~tvm.relay.adt.Match` for its definition and documentation. - -TempExprs -========= - -Program transformations (passes) in Relay may require inserting temporary -state into the program AST to guide further transformations. The -:code:`TempExpr` node is provided as a utility to developers for this purpose; -nodes inheriting from :code:`TempExpr` cannot appear directly in user-provided -code but may be inserted in a pass. 
Any :code:`TempExpr` created in a pass -should ideally be eliminated before the pass is complete, as a -:code:`TempExpr` only stores internal state and has no semantics of its own. - -For an example of :code:`TempExpr` being used in a pass, see -:code:`src/relay/transforms/fold_scale_axis.cc`, which uses -:code:`TempExpr` nodes to store information about scaling parameters -as the pass tries to fold these into the weights of a convolution. - -See :py:class:`~tvm.relay.expr.TempExpr` for its definition and documentation. diff --git a/docs/reference/langref/relay_op.rst b/docs/reference/langref/relay_op.rst deleted file mode 100644 index 8bc24b9ab865..000000000000 --- a/docs/reference/langref/relay_op.rst +++ /dev/null @@ -1,247 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Relay Core Tensor Operators -=========================== - -This page contains the list of core tensor operator primitives pre-defined in tvm.relay. -The core tensor operator primitives cover typical workloads in deep learning. -They can represent workloads in front-end frameworks and provide basic building blocks for optimization. -Since deep learning is a fast evolving field, it is possible to have operators that are not in here. - - -.. note:: - - This document will directly list the function signature of - these operators in the python frontend. - - -Overview of Operators ---------------------- -**Level 1: Basic Operators** - -This level enables fully connected multi-layer perceptron. - -.. autosummary:: - :nosignatures: - - tvm.relay.log - tvm.relay.sqrt - tvm.relay.rsqrt - tvm.relay.exp - tvm.relay.sigmoid - tvm.relay.add - tvm.relay.subtract - tvm.relay.multiply - tvm.relay.divide - tvm.relay.mod - tvm.relay.tanh - tvm.relay.concatenate - tvm.relay.expand_dims - tvm.relay.nn.softmax - tvm.relay.nn.log_softmax - tvm.relay.nn.relu - tvm.relay.nn.dropout - tvm.relay.nn.batch_norm - tvm.relay.nn.bias_add - - -**Level 2: Convolutions** - -This level enables typical convnet models. - -.. 
autosummary:: - :nosignatures: - - tvm.relay.nn.conv2d - tvm.relay.nn.conv2d_transpose - tvm.relay.nn.conv3d - tvm.relay.nn.conv3d_transpose - tvm.relay.nn.dense - tvm.relay.nn.max_pool2d - tvm.relay.nn.max_pool3d - tvm.relay.nn.avg_pool2d - tvm.relay.nn.avg_pool3d - tvm.relay.nn.global_max_pool2d - tvm.relay.nn.global_avg_pool2d - tvm.relay.nn.upsampling - tvm.relay.nn.upsampling3d - tvm.relay.nn.batch_flatten - tvm.relay.nn.pad - tvm.relay.nn.lrn - tvm.relay.nn.l2_normalize - tvm.relay.nn.bitpack - tvm.relay.nn.bitserial_dense - tvm.relay.nn.bitserial_conv2d - tvm.relay.nn.contrib_conv2d_winograd_without_weight_transform - tvm.relay.nn.contrib_conv2d_winograd_weight_transform - tvm.relay.nn.contrib_conv3d_winograd_without_weight_transform - tvm.relay.nn.contrib_conv3d_winograd_weight_transform - - -**Level 3: Additional Math And Transform Operators** - -This level enables additional math and transform operators. - -.. autosummary:: - :nosignatures: - - tvm.relay.nn.leaky_relu - tvm.relay.nn.prelu - tvm.relay.reshape - tvm.relay.reshape_like - tvm.relay.copy - tvm.relay.transpose - tvm.relay.squeeze - tvm.relay.floor - tvm.relay.ceil - tvm.relay.sign - tvm.relay.trunc - tvm.relay.clip - tvm.relay.round - tvm.relay.abs - tvm.relay.negative - tvm.relay.take - tvm.relay.zeros - tvm.relay.zeros_like - tvm.relay.ones - tvm.relay.ones_like - tvm.relay.gather - tvm.relay.gather_nd - tvm.relay.full - tvm.relay.full_like - tvm.relay.cast - tvm.relay.reinterpret - tvm.relay.split - tvm.relay.arange - tvm.relay.meshgrid - tvm.relay.stack - tvm.relay.repeat - tvm.relay.tile - tvm.relay.reverse - tvm.relay.reverse_sequence - tvm.relay.unravel_index - tvm.relay.sparse_to_dense - - -**Level 4: Broadcast and Reductions** - -.. autosummary:: - :nosignatures: - - tvm.relay.right_shift - tvm.relay.left_shift - tvm.relay.equal - tvm.relay.not_equal - tvm.relay.greater - tvm.relay.greater_equal - tvm.relay.less - tvm.relay.less_equal - tvm.relay.all - tvm.relay.any - tvm.relay.logical_and - tvm.relay.logical_or - tvm.relay.logical_not - tvm.relay.logical_xor - tvm.relay.maximum - tvm.relay.minimum - tvm.relay.power - tvm.relay.where - tvm.relay.argmax - tvm.relay.argmin - tvm.relay.sum - tvm.relay.max - tvm.relay.min - tvm.relay.mean - tvm.relay.variance - tvm.relay.std - tvm.relay.mean_variance - tvm.relay.mean_std - tvm.relay.prod - tvm.relay.strided_slice - tvm.relay.broadcast_to - - -**Level 5: Vision/Image Operators** - -.. autosummary:: - :nosignatures: - - tvm.relay.image.resize1d - tvm.relay.image.resize2d - tvm.relay.image.resize3d - tvm.relay.image.crop_and_resize - tvm.relay.image.dilation2d - tvm.relay.vision.multibox_prior - tvm.relay.vision.multibox_transform_loc - tvm.relay.vision.nms - tvm.relay.vision.yolo_reorg - - -**Level 6: Algorithm Operators** - -.. autosummary:: - :nosignatures: - - tvm.relay.argsort - tvm.relay.topk - - -**Level 10: Temporary Operators** - -This level support backpropagation of broadcast operators. It is temporary. - -.. autosummary:: - :nosignatures: - - tvm.relay.broadcast_to_like - tvm.relay.collapse_sum_like - tvm.relay.slice_like - tvm.relay.shape_of - tvm.relay.ndarray_size - tvm.relay.layout_transform - tvm.relay.device_copy - tvm.relay.annotation.on_device - tvm.relay.reverse_reshape - tvm.relay.sequence_mask - tvm.relay.nn.batch_matmul - tvm.relay.nn.adaptive_max_pool2d - tvm.relay.nn.adaptive_avg_pool2d - tvm.relay.one_hot - - -**Level 11: Dialect Operators** - -This level supports dialect operators. - -.. 
autosummary::
-   :nosignatures:
-
-   tvm.relay.qnn.op.add
-   tvm.relay.qnn.op.batch_matmul
-   tvm.relay.qnn.op.concatenate
-   tvm.relay.qnn.op.conv2d
-   tvm.relay.qnn.op.conv2d_transpose
-   tvm.relay.qnn.op.dense
-   tvm.relay.qnn.op.dequantize
-   tvm.relay.qnn.op.mul
-   tvm.relay.qnn.op.quantize
-   tvm.relay.qnn.op.requantize
-   tvm.relay.qnn.op.rsqrt
-   tvm.relay.qnn.op.simulated_dequantize
-   tvm.relay.qnn.op.simulated_quantize
-   tvm.relay.qnn.op.subtract
diff --git a/docs/reference/langref/relay_pattern.rst b/docs/reference/langref/relay_pattern.rst
deleted file mode 100644
index a80c55323b98..000000000000
--- a/docs/reference/langref/relay_pattern.rst
+++ /dev/null
@@ -1,549 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements. See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership. The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied. See the License for the
-   specific language governing permissions and limitations
-   under the License.
-
-
-=========================
-Pattern Matching in Relay
-=========================
-
-There are many places in TVM where we identify pure data-flow sub-graphs of the Relay program and
-attempt to transform them in some way; example passes include fusion, quantization, external code
-generation, and device-specific optimizations.
-
-Many of these passes today require a lot of boilerplate code to implement and force users to think
-in terms of visitors and AST matching, yet many of these transformations can easily be described
-as graph rewrites. In order to build a rewriter or other advanced machinery, we first need a
-language of patterns to describe what we can match.
-
-Such a language is useful not only for building a rewriter but also for providing extension points
-for existing passes. For example, the fusion pass could be parameterized by a set of fusion
-patterns that describe the capabilities of your hardware, and the quantization pass could take a
-set of patterns that describe which operators can be quantized on a given platform.
-
-In the backend world, we could use the same machinery to build a higher-level API for
-bring-your-own code generation: it would take a set of patterns describing your hardware
-capabilities together with an external compiler, providing a relatively smooth heterogeneous
-experience out of the box.
-
-Pattern Examples
-================
-
-There are quite a few properties of operators that are worth matching. Below we examine how to
-match tree properties and expand on some use cases that are not fully explored in the prototype.
-This section demonstrates how to write patterns. It is recommended to check
-`tests/python/relay/test_dataflow_pattern.py`_ for more use cases.
-
-.. _tests/python/relay/test_dataflow_pattern.py: https://github.com/apache/tvm/blob/main/tests/python/relay/test_dataflow_pattern.py
-
-.. note::
-
-    If you cannot find the corresponding pattern node to match the Relay node you want,
-    you are welcome to raise an issue or submit a PR to add it.
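-
-The examples below assume the pattern combinators are imported from
-:code:`tvm.relay.dataflow_pattern` (e.g., :code:`is_op`, :code:`wildcard`, :code:`is_var`). As a
-minimal end-to-end sketch of the workflow:
-
-.. code-block:: python
-
-    from tvm import relay
-    from tvm.relay.dataflow_pattern import is_op, wildcard
-
-    # A pattern matching an add of any two sub-expressions.
-    pat = is_op("add")(wildcard(), wildcard())
-
-    x = relay.var("x", shape=(2, 2))
-    y = relay.var("y", shape=(2, 2))
-    assert pat.match(relay.add(x, y))           # matches
-    assert not pat.match(relay.subtract(x, y))  # different operator, no match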
- -Matching One of Two Ops -*********************** - -The first example is a simple case where we want to match one operator with a single input OR -another operator with a single input: - -.. code-block:: python - - def test_match_op_or(): - is_add_or_sub = is_op('add') | is_op('subtract') - assert is_add_or_sub.match(relay.op.op.get("add")) - assert is_add_or_sub.match(relay.op.op.get("subtract")) - - -Matching an Op with Attributes -****************************** - -The next example is a dense operation with any operator that is marked element-wise: - -.. code-block:: python - - def test_no_match_attr(): - op = is_op('nn.dense').has_attr({"TOpPattern": K_ELEMWISE}) - op_pat = op(wildcard(), wildcard()) - x = relay.var('x') - y = relay.var('y') - assert not op_pat.match(relay.op.nn.dense(x, y)) - -Here is another example to match an op with a specific attribute: - -.. code-block:: python - - def test_match_data_layout(): - is_conv2d = is_op('nn.conv2d')(wildcard(), wildcard()).has_attr({"data_layout": "NHWC"}) - x = relay.var('x') - y = relay.var('y') - assert not is_conv2d.match(relay.op.nn.conv2d(x, y)) - -Or a convolution with a specific kernel size: - -.. code-block:: python - - def test_match_kernel_size(): - is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard()).has_attr({"kernel_size": [3, 3]}) - x = relay.var('x') - y = relay.var('y') - assert is_conv2d.match(relay.op.nn.conv2d(x, y, kernel_size=[3, 3])) - - - -Matching an Optional Op -*********************** - -The next example is matching a pattern with one optional operator. In this pattern, -we can match the graph of conv2d+bias_add+relu or the graph of conv2d+bias_add. - -.. code-block:: python - - def test_match_optional(): - conv_node = is_op('nn.conv2d')(wildcard(), wildcard()) - bias_node = is_op('nn.bias_add')(conv_node, wildcard()) - pat = bias_node.optional(lambda x: is_op('nn.relu')(x)) - - x = relay.var('x') - y = relay.var('y') - z = relay.var('z') - conv2d = relay.op.nn.conv2d(x, y) - bias = relay.op.nn.bias_add(conv2d, z) - assert pat.match(bias) - relu = relay.op.nn.relu(bias) - assert pat.match(relu) - - -Matching Types -************** - -In addition to matching ops with attributes, we can also make a pattern to match their types, in interms of the shape and data type. Here are some examples: - -.. code-block:: python - - def test_match_type(): - # Match any op with float32 - pat1 = has_dtype('float32') - x = relay.var('x', shape=(10, 10), dtype='float32') - assert pat1.match(x) - - # Match any op with shape (10, 10) - pat2 = has_shape((10, 10)) - x = relay.var('x', shape=(10, 10), dtype='float32') - assert pat2.match(x) - - # Match conv2d+relu with a certain shape - conv2d = is_op('nn.conv2d')(wildcard(), wildcard()) - pat3 = is_op('nn.relu')(conv2d).has_shape((1, 32, 28, 28)) - - x = relay.var('x', shape=(1, 3, 28, 28), dtype='float32') - w = relay.var('w', shape=(32, 3, 3, 3), dtype='float32') - conv2d = relay.nn.conv2d(x, w, strides=(1, 1), padding=(1, 1)) - relu = relay.nn.relu(conv2d) - assert pat3.match(relu) - - -Matching Non-Call Nodes -*********************** - -Sometimes we may also want to match a pattern that includes Tuple or TupleGetItem nodes. -Since there are not call nodes, we need to use specific pattern nodes to match them: - -.. 
code-block:: python - - def test_match_tuple(): - x = relay.var('x') - y = relay.var('y') - z = relay.var('z') - tuple_pattern = is_tuple((wildcard(), wildcard(), wildcard())) - assert tuple_pattern.match(relay.expr.Tuple((x,y,z))) - -The next example is matching a pattern of batch_norm -> get(0) -> relu. Note that you can also use `is_tuple_get_item(bn_node)` to match a `TupleGetItem` node with any index. - -.. code-block:: python - - def test_match_tuple_get_item(): - bn_node = is_op('nn.batch_norm')(wildcard(), wildcard(), wildcard(), wildcard(), wildcard()) - tuple_get_item_node = is_tuple_get_item(bn_node, 0) - pat = is_op('nn.relu')(tuple_get_item_node) - - x = relay.var('x', shape=(1, 8)) - gamma = relay.var("gamma", shape=(8,)) - beta = relay.var("beta", shape=(8,)) - moving_mean = relay.var("moving_mean", shape=(8,)) - moving_var = relay.var("moving_var", shape=(8,)) - bn_node = relay.nn.batch_norm(x, gamma, beta, moving_mean, moving_var) - tuple_get_item_node = bn_node[0] - out = relay.nn.relu(tuple_get_item_node) - pat.match(out) - -If we have a pattern that crosses a function boundary, we might want to match the Function itself - - -.. code-block:: python - - def test_match_func(): - x = relay.var("x") - y = relay.var("y") - wc1 = wildcard() - wc2 = wildcard() - func_pattern = FunctionPattern([wc1, wc2], wc1 + wc2) - assert func_pattern.match(relay.Function([x, y], x + y)) - -The next example is matching a constant node regarding its values. This is useful to check -if a specific parameter in a subgraph has been bound or not. - -.. code-block:: python - - def test_match_constant(): - conv2d = is_op('nn.conv2d')(wildcard(), is_constant()) - pattern = is_op('nn.bias_add')(conv2d, wildcard()) - - x = relay.var('x', shape=(1, 3, 224, 224)) - w = relay.var('w', shape=(3, 3, 3, 3)) - b = relay.var('b', shape=(3, )) - conv2d = relay.op.nn.conv2d(x, w) - out = relay.op.nn.bias_add(conv2d, b) - func = relay.Function([x, w, b], out) - mod = tvm.IRModule.from_expr(func) - - # Two inputs of the conv2d in the graph are VarNode by default, so no match. - assert not pattern.match(mod['main'].body) - - # The second input (weight) has been bind with constant values so it is now a constant node. - mod["main"] = bind_params_by_name(mod["main"], - {'w': tvm.nd.array(np.ones(shape=(3, 3, 3, 3)))}) - assert pattern.match(mod['main'].body) - -On the other hand, if you need to match the constant with a specific value, you can directly -use ``is_expr``. This could be useful for algebraic simplify. - -.. code-block:: python - - def test_match_plus_zero(): - zero = (is_expr(relay.const(0)) | is_expr(relay.const(0.0))) - pattern = wildcard() + zero - - x = relay.Var('x') - y = x + relay.const(0) - assert pattern.match(y) - -The next example is matching function nodes with a specific attribute: - -.. code-block:: python - - def test_match_function(): - pattern = wildcard().has_attr({"Composite": "add"}) - - x = relay.var('x') - y = relay.var('y') - f = relay.Function([x, y], x + y).with_attr("Composite", "add") - assert pattern.match(f) - -A Relay ``If`` expression can be matched if all of its condition, true branch and false branch -are matched: - -.. code-block:: python - - def test_match_if(): - x = is_var("x") - y = is_var("y") - pat = is_if(is_op("less")(x, y), x, y) - - x = relay.var("x") - y = relay.var("y") - cond = x < y - - assert pat.match(relay.expr.If(cond, x, y)) - - -A Relay ``Let`` expression can be matched if all of its variable, value, and body -are matched: - -.. 
code-block:: python
-
-    def test_match_let():
-        x = is_var("x")
-        y = is_var("y")
-        let_var = is_var("let")
-        pat = is_let(let_var, is_op("less")(x, y), let_var)
-
-        x = relay.var("x")
-        y = relay.var("y")
-        lv = relay.var("let")
-        cond = x < y
-        assert pat.match(relay.expr.Let(lv, cond, lv))
-
-Matching Diamonds and Post-Dominator Graphs
-*******************************************
-
-The next example is matching a diamond with two inputs at the top of the diamond::
-
-    def test_match_diamond():
-        # Pattern
-        is_conv2d = is_op('nn.conv2d')(is_var(), is_var())
-        path1 = is_op('nn.relu')(is_conv2d)
-        path2 = is_op('nn.leaky_relu')(is_conv2d)
-        diamond = is_op('add')(path1, path2)
-
-        # Expr
-        inp = relay.var('input')
-        weight = relay.var('weight')
-        conv2d = relay.op.nn.conv2d(inp, weight)
-        relu = relay.op.nn.relu(conv2d)
-        leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-        out = relu + leaky_relu
-
-        # Check
-        assert diamond.match(out)
-
-The final example is matching diamonds with a post-dominator relationship. We embed dominator analysis as a type of matching in the pattern language in order to allow for pattern matching with unknown topology. This is important because we want to be able to use the language to describe fusion patterns, like a conv2d followed by element-wise operations::
-
-    def test_match_dom_diamond():
-        # Pattern
-        is_conv2d = is_op('nn.conv2d')(is_var(), is_var())
-        # Any call to an op marked element-wise may appear on the path.
-        is_elemwise = (wildcard().has_attr({"TOpPattern": K_ELEMWISE}))(wildcard())
-        reduction = is_op('add')(wildcard(), wildcard())
-        diamond = dominates(is_conv2d, is_elemwise, reduction)
-
-        # Expr
-        inp = relay.var('input')
-        weight = relay.var('weight')
-        conv2d = relay.op.nn.conv2d(inp, weight)
-        relu = relay.op.nn.relu(conv2d)
-        leaky_relu = relay.op.nn.leaky_relu(conv2d, alpha=0)
-        out = relu + leaky_relu
-
-        # Check
-        assert diamond.match(out)
-
-
-Matching Fuzzy Patterns
-=======================
-
-The dominator analysis above lets one match a subgraph of the Relay AST that doesn't correspond to a set of pattern nodes exactly 1-to-1. There are a few other places where we support such "fuzzy" matching.
-
-Tuples, Functions, and Call nodes with any number of inputs can be matched by passing `None` as the argument value, i.e.::
-
-    tuple_pattern = is_tuple(None)
-    func_pattern = FunctionPattern(None, wildcard() + wildcard())
-    call_pattern = func_pattern(None)
-
-These patterns allow matching more generic classes of patterns by constraining the use of the arguments rather than the number of arguments.
-
-Additionally, we support matching Functions with fuzzy bodies, i.e., a function body that is under-constrained by the pattern. The pattern `FunctionPattern([is_var(), is_var()], wildcard() + wildcard())` will match `relay.Function([x, y], x + y)`, but it will also match `relay.Function([x, y], x * x + y)`. In the second case, the pattern doesn't perfectly constrain the body of the function, so the resulting match is fuzzy.
-
-
-Pattern Language Design
-=======================
-
-The proposed pattern language is designed to be a mirror of Relay's IR with additional support for common scenarios. The goal of the pattern language is to provide a regular-expression-like capability for matching data-flow graphs and doing rewriting.
-
-The high-level design is to introduce a language of patterns; for now, we propose the language as::
-
-    Pattern ::= expr
-            | *
-            | pattern(pattern1, ... patternN)
-            | has_type(type)
-            | has_dtype(type)
-            | has_shape(shape)
-            | has_attr(attrs)
-            | is_var(name)
-            | is_constant()
-            | is_expr(expr)
-            | is_op(op_name)
-            | is_tuple()
-            | is_tuple_get_item(pattern, index = None)
-            | is_if(cond, tru, fls)
-            | is_let(var, value, body)
-            | pattern1 `|` pattern2
-            | dominates(parent_pattern, path_pattern, child_pattern)
-            | FunctionPattern(params, body)
-
-The above language then provides a matching interface that can both select sub-graphs and verify that a graph matches the pattern.
-
-Expression Pattern
-******************
-
-Match a literal expression.
-
-Wildcard
-********
-
-Match any expression.
-
-Type Pattern
-************
-
-Check that the expression matched by the nested pattern has a particular type.
-
-DType Pattern
-*************
-
-Check that the expression matched by the nested pattern has a particular data type.
-
-Shape Pattern
-*************
-
-Check that the expression matched by the nested pattern has a particular output shape.
-
-Attribute Pattern
-*****************
-
-Check that the operator matched by the pattern has an attribute with a particular value.
-
-Variable Pattern
-****************
-
-Check that the expression is a Relay Variable, and optionally provide a name to match against the Variable's name.
-
-
-Alternate
-*********
-
-Either match the first pattern or the second pattern.
-
-Domination
-**********
-
-Match the child pattern, find a match for the parent pattern, ensuring that the child ultimately dominates the parent (i.e., no nodes outside the pattern use outputs of the parent), and that every node between the child and the parent matches the path pattern.
-
-Function Pattern
-****************
-
-Match a Function with a body and parameters.
-
-If Pattern
-**********
-
-Match an If with a condition, true branch, and false branch.
-
-Let Pattern
-***********
-
-Match a Let with a variable, value, and body.
-
-Applications
-============
-
-The pattern language provides not only pattern matching but also pattern processing.
-Here we introduce two pattern processing approaches and provide some examples.
-
-Pattern Rewriting
-*****************
-
-If you would like to replace the matched pattern with another subgraph, you can leverage
-the ``rewrite`` transformation. Here is an example of rewriting a series of arithmetic operators
-with a single batch_norm op. The constructor parameter ``require_type`` indicates whether InferType
-is required to be run before the callback.
-
-.. code-block:: python
-
-    class BatchnormCallback(DFPatternCallback):
-        # A callback class to rewrite the matched pattern to a batch_norm op.
-        def __init__(self, require_type=False):
-            super().__init__(require_type)
-            self.x = wildcard()
-            self.var = wildcard()
-            self.mean = wildcard()
-            self.beta = wildcard()
-            self.gamma = wildcard()
-            self.eps = wildcard()
-
-            self.pattern = self.gamma * (self.x - self.mean)/is_op("sqrt")(self.var + self.eps) + self.beta
-
-        def callback(self, pre, post, node_map):
-            x = node_map[self.x][0]
-            var = node_map[self.var][0]
-            mean = node_map[self.mean][0]
-            beta = node_map[self.beta][0]
-            gamma = node_map[self.gamma][0]
-            eps = node_map[self.eps][0]
-            return relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon = eps.data.numpy().item())[0]
-
-    # A graph of arithmetic operators that are functionally equivalent to batch_norm.
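-    # The pattern above encodes the unfused form
-    #     gamma * (x - mean) / sqrt(var + eps) + beta
-    # which the callback collapses back into a single nn.batch_norm call.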
- x = relay.var('x') - var = relay.var('var') - mean = relay.var('mean') - beta = relay.var('beta') - gamma = relay.var('gamma') - BN = gamma * (x - mean)/relay.op.sqrt(var + relay.const(1e-5)) + beta - - from tvm.relay.dataflow_pattern import rewrite - out = rewrite(BatchnormCallback(), BN) - assert tvm.ir.structural_equal(out, relay.op.nn.batch_norm(x, gamma, beta, mean, var, epsilon = 1e-5)[0]) - -The function ``def callback(self, pre, post, node_map)`` will be invoked when the rewriter matches -``self.pattern``. ``node_map`` is a dictionary mapping from pattern nodes to matched nodes in the graph. - -The callback function will be invoked recursively on the returned pattern until the pattern stops changing. As a result, if ``self.pattern`` matches any part of the graph that the callback returned, the rewriter will run in a loop. If you want to avoid multiple rewrites, you can pass a ``rewrite_once=True`` parameter to the constructor. - -Pattern Partitioning -******************** - -If you would like to perform a more complex processing for matched subgraphs and you are not -satisfied with ``rewrite``, you may consider partitioning the matched subgraphs to a separate -Relay function and perform other processes to the function. Here we use ``pattern.partition`` -to create a new Relay function for each matched subgraph. The functionality is similar to -the op fusion pass in TVM: - -.. code-block:: python - - # A pattern matching conv2d+relu. - pattern = is_op("nn.relu")(is_op("nn.conv2d")(wildcard(), wildcard())) - - # A graph. - x = relay.var('input') - w = relay.var('weight') - conv2d = relay.op.nn.conv2d(x, w) - relu = relay.op.nn.relu(conv2d) - print('relu') - # free_var %x: Tensor[(1, 3, 224, 224), float32] - # free_var %w: Tensor[(3, 3, 3, 3), float32] - # %0 = nn.conv2d(%x, %w, padding=[0, 0, 0, 0]) /* ty=Tensor[(1, 3, 222, 222), float32] */; - # free_var %b: Tensor[(3), float32] - # nn.bias_add(%0, %b) /* ty=Tensor[(1, 3, 222, 222), float32] */ - - # After partition. - print(pattern.partition(relu)) - # free_var %x: Tensor[(1, 3, 224, 224), float32] - # free_var %w: Tensor[(3, 3, 3, 3), float32] - # free_var %b: Tensor[(3), float32] - # %1 = fn (%FunctionVar_0_0, %FunctionVar_0_1, - # %FunctionVar_0_2, PartitionedFromPattern="nn.conv2d_nn.bias_add_") { - # %0 = nn.conv2d(%FunctionVar_0_0, %FunctionVar_0_1, padding=[0, 0, 0, 0]); - # nn.bias_add(%0, %FunctionVar_0_2) - # }; - # %1(%x, %w, %b) - -Note that you can also specify the attributes for the created functions: - -.. code-block:: python - - print(pattern.partition(relu, {'Composite': 'one_layer'})) - # free_var %x: Tensor[(1, 3, 224, 224), float32] - # free_var %w: Tensor[(3, 3, 3, 3), float32] - # free_var %b: Tensor[(3), float32] - # %1 = fn (%FunctionVar_0_0, %FunctionVar_0_1, - # %FunctionVar_0_2, Composite="one_layer", - # PartitionedFromPattern="nn.conv2d_nn.bias_add_") { - # %0 = nn.conv2d(%FunctionVar_0_0, %FunctionVar_0_1, padding=[0, 0, 0, 0]); - # nn.bias_add(%0, %FunctionVar_0_2) - # }; - # %1(%x, %w, %b) - -If you need a customized checking function that cannot be specified using pattern language, -you can specify ``check`` function when partitioning. The following example demonstrates a -case that checks input data layout of a subgraph: - -.. 
code-block:: python - - def check(pre): - conv = pre.args[0] - return (conv.attrs.data_layout == "NCHW") and bool(conv.checked_type.shape[0] == 1) - - pattern.partition(relu, check=check) - -In this example, we check if the first argument of the matched subgraph (i.e., ``pre.args[0]``) -has data layout "NCHW" and if its batch size is 1. This feature is useful if the conditions -of matching a pattern cannot be verified by analyzing the pattern itself. diff --git a/docs/reference/langref/relay_type.rst b/docs/reference/langref/relay_type.rst deleted file mode 100644 index 632c6387c5cc..000000000000 --- a/docs/reference/langref/relay_type.rst +++ /dev/null @@ -1,398 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -=================== -Relay's Type System -=================== - -We briefly introduced types while detailing Relay's expression language, -but have not yet described its type system. Relay is -a statically typed and type-inferred language, allowing programs to -be fully typed while requiring just a few explicit type annotations. - -Static types are useful when performing compiler optimizations because they -communicate properties about the data a program manipulates, such as runtime -shape, data layout, and storage, without needing to run the program. -Relay's `Algebraic Data Types`_ allow for easily and flexibly composing -types in order to build data structures that can be -reasoned about inductively and used to write recursive functions. - -Relay's type system features a form of *dependent typing* for shapes. That is, its type system keeps track of the shapes of tensors in a Relay program. Treating tensor -shapes as types allows Relay to perform more powerful reasoning at compile time; -in particular, Relay can statically reason about operations whose output shapes -vary based on the input shapes in complex ways. Casting shape inference as a type -inference problem allows Relay to infer the shapes of all tensors at compile time, -including in programs that use branching and function calls. - -Statically reasoning about shapes in this manner allows -Relay to be ahead-of-time compiled and provides much more information about -tensors for optimizations further in the compilation pipeline. Such optimizations -can be implemented as passes, which are Relay-to-Relay AST transformations, and -may use the inferred types (e.g., shape information) for making decisions about -program transformations. For instance, :code:`src/relay/transforms/fuse_ops.cc` gives -an implementation of a pass that uses inferred tensor shapes to replace invocations -of operators in a Relay program with fused operator implementations. 
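-
-For a concrete sense of what this looks like from the Python API, here is a minimal sketch
-(the small conv2d function and its printed type are illustrative only) of running type inference
-on a module and reading back the inferred type of its result:
-
-.. code-block:: python
-
-    import tvm
-    from tvm import relay
-
-    # A tiny function whose output type must be inferred.
-    x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")
-    w = relay.var("w", shape=(16, 3, 3, 3), dtype="float32")
-    y = relay.nn.relu(relay.nn.conv2d(x, w, padding=(1, 1)))
-    mod = tvm.IRModule.from_expr(relay.Function([x, w], y))
-
-    # InferType fills in the checked_type of every sub-expression.
-    mod = relay.transform.InferType()(mod)
-    print(mod["main"].body.checked_type)  # -> Tensor[(1, 16, 224, 224), float32]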
- -Reasoning about tensor types in Relay is encoded using *type relations*, which means -that the bulk of type checking in Relay is constraint solving (ensuring that all -type relations are satisfied at call sites). Type relations offer a flexible and -relatively simple way of making the power of dependent typing available in Relay -without greatly increasing the complexity of its type system. - -Below we detail the language of types in Relay and how they are assigned to Relay expressions. - -Type -==== - -The base type for all Relay types. All Relay types are sub-classes of this base type. - -See :py:class:`~tvm.relay.ty.Type` for its definition and documentation. - -Tensor Type -=========== - -A concrete tensor type in Relay. - -Tensors are typed according to data type and shape. At present, these use TVM's -data types and shapes, but in the future, Relay may include a separate AST for -shapes. In particular, data types include :code:`bool`, :code:`float32`, :code:`int8` and various -other bit widths and numbers of lanes. Shapes are given as tuples of dimensions (TVM :code:`IndexExpr`), -such as :code:`(5, 5)`; scalars are also given tuple types and have a shape of :code:`()`. - -Note, though, that TVM shapes can also include variables and arithmetic expressions -including variables, so Relay's constraint solving phase will attempt to find -assignments to all shape variables to ensure all shapes will be concrete before -running a program. - -For example, here is a simple concrete tensor type corresponding to a 10-by-10 tensor of 32-bit floats: - -.. code-block:: - - Tensor[(10, 10), float32] - -See :py:class:`~tvm.relay.ty.TensorType` for its definition and documentation. - -Tuple Type -========== - -A type of a tuple in Relay. - -Just as a tuple is simply a sequence of values of statically known length, the type -of a tuple consists of a sequence of the types corresponding to each member of the tuple. - -Because a tuple type is of statically known size, the type of a tuple projection -is simply the corresponding index into the tuple type. - -For example, in the below code, :code:`%t` is of type -:code:`(Tensor[(), bool], Tensor[(10, 10), float32])` -and :code:`%c` is of type :code:`Tensor[(10, 10), float32]`. - -.. code-block:: - - let %t = (False, Constant(1, (10, 10), float32)); - let %c = %t.1; - %c - -See :py:class:`~tvm.relay.ty.TupleType` for its definition and documentation. - -.. _type-parameter: - -Type Parameter -============== - -Type parameters represent placeholder types used for polymorphism in functions. -Type parameters are specified according to *kind*, corresponding to the types -those parameters are allowed to replace: - -- :code:`Type`, corresponding to top-level Relay types like tensor types, tuple types, and function types -- :code:`BaseType`, corresponding to the base type of a tensor (e.g., :code:`float32`, :code:`bool`) -- :code:`Shape`, corresponding to a tensor shape -- :code:`ShapeVar`, corresponding to variables within a tensor shape - -Relay's type system enforces that type parameters are only allowed to appear where their kind permits them, -so if type variable :code:`t` is of kind :code:`Type`, :code:`Tensor[t, float32]` is not a valid type. - -.. *Note: At present, only type parameters of kind :code:`Type` are supported.* - -Like normal parameters, concrete arguments must be given for type parameters at call sites. - -.. 
*Note: type parameter syntax is not yet supported in the text format.* - -For example, :code:`s` below is a type parameter of kind :code:`Shape` and it will -be substituted with :code:`(10, 10)` at the call site below: - -.. code-block:: - - def @plus(%t1 : Tensor[s, float32], %t2 : Tensor[s, float32]) { - add(%t1, %t2) - } - plus<(10, 10)>(%a, %b) - -See :py:class:`~tvm.relay.ty.TypeVar` for its definition and documentation. - -Type Constraint -=============== - -This is an abstract class representing a type constraint, to be elaborated -upon in further releases. Currently, type relations are the only -type constraints provided; they are discussed below. - -See :py:class:`~tvm.relay.ty.TypeConstraint` for its definition and documentation. - -Function Type -============= - -A function type in Relay, see `tvm/relay/type.h` for more details. - -This is the type assigned to functions in Relay. A function type -consists of a list of type parameters, a set of type constraints, -a sequence of argument types, and a return type. - -We informally write function types as: -:code:`fn(arg_types) -> ret_type where type_constraints` - -A type parameter in the function type may appear in the argument -types or the return types. Additionally, each of the type constraints -must hold at every call site of the function. The type constraints -typically take the function's argument types and the function's return -type as arguments, but may take a subset instead. - -See :py:class:`~tvm.relay.ty.FuncType` for its definition and documentation. - -.. _type-relation: - -Type Relation -============= - -A type relation is the most complex type system feature in Relay. -It allows users to extend type inference with new rules. -We use type relations to define types for operators that work with -tensor shapes in complex ways, such as broadcasting operators or -:code:`flatten`, allowing Relay to statically reason about the shapes -in these cases. - -A type relation :code:`R` describes a relationship between the input and output types of a Relay function. -Namely, :code:`R` is a function on types that -outputs `true` if the relationship holds and `false` -if it fails to hold. Types given to a relation may be incomplete or -include shape variables, so type inference must assign appropriate -values to incomplete types and shape variables for necessary relations -to hold, if such values exist. - -For example we can define an identity relation to be: - -.. code-block:: prolog - - Identity(I, I) :- true - -It is usually convenient to type operators -in Relay by defining a relation specific to that operator that -encodes all the necessary constraints on the argument types -and the return type. For example, we can define the relation for :code:`flatten`: - -.. code-block:: prolog - - Flatten(Tensor(sh, bt), O) :- - O = Tensor(sh[0], prod(sh[1:])) - -If we have a relation like :code:`Broadcast` it becomes possible -to type operators like :code:`add`: - -.. code-block:: - - add : fn(t1, t2) -> t3 - where Broadcast - -The inclusion of :code:`Broadcast` above indicates that the argument -types and the return type must be tensors where the shape of :code:`t3` is -the broadcast of the shapes of :code:`t1` and :code:`t2`. The type system will -accept any argument types and return type so long as they fulfill -:code:`Broadcast`. - -Note that the above example relations are written in Prolog-like syntax, -but currently the relations must be implemented by users in C++ -or Python. 
More specifically, Relay's type system uses an *ad hoc* solver -for type relations in which type relations are actually implemented as -C++ or Python functions that check whether the relation holds and -imperatively update any shape variables or incomplete types. In the current -implementation, the functions implementing relations should return :code:`False` -if the relation fails to hold and :code:`True` if the relation holds or if -there is not enough information to determine whether it holds or not. - -The functions for all the relations are run as needed (if an input is updated) -until one of the following conditions holds: - -1. All relations hold and no incomplete types remain (typechecking succeeds). -2. A relation fails to hold (a type error). -3. A fixpoint is reached where shape variables or incomplete types remain (either a type error or more type annotations may be needed). - -Presently all of the relations used in Relay are implemented in C++. -See the files in :code:`src/relay/op` for examples of relations implemented -in C++. - -See :py:class:`~tvm.relay.ty.TypeRelation` for its definition and documentation. - -Incomplete Type -=============== - -An incomplete type is a type or portion of a type that is not yet known. -This is only used during type inference. Any omitted type annotation is -replaced by an incomplete type, which will be replaced by another -type at a later point. - -Incomplete types are known as "type variables" or "type holes" in the programming languages -literature. We use the name "incomplete type" in order to more clearly distinguish them from type -parameters: Type parameters must be bound to a function and are replaced with concrete type arguments (instantiated) -at call sites, whereas incomplete types may appear anywhere in the program and are filled in during type inference. - -See :py:class:`~tvm.relay.ty.IncompleteType` for its definition and documentation. - -.. _adt-typing: - -Algebraic Data Types -==================== - -*Note: ADTs are not currently supported in the text format.* - -Algebraic data types (ADTs) are described in more detail in -:ref:`their overview `; this section describes -their implementation in the type system. - -An ADT is defined by a collection of named constructors, -each of which takes arguments of certain types. -An instance of an ADT is a container that stores the values -of the constructor arguments used to produce it as well as the -name of the constructor; the values can be retrieved by -deconstructing the instance by matching based on its constructor. -Hence, ADTs are sometimes called "tagged unions": like a C-style -union, the contents of an instance for a given ADT may have -different types in certain cases, but the constructor serves as a -tag to indicate how to interpret the contents. - -From the type system's perspective, it is most pertinent that -ADTs can take type parameters (constructor arguments can be -type parameters, though ADT instances with different type -parameters must be treated as different types) and be -recursive (a constructor for an ADT can take an instance of -that ADT, thus an ADT like a tree or list can be inductively -built up). The representation of ADTs in the type system must -be able to accommodate these facts, as the below sections will detail. - -Global Type Variable -~~~~~~~~~~~~~~~~~~~~ - -To represent ADTs compactly and easily allow for recursive ADT definitions, -an ADT definition is given a handle in the form of a global type variable -that uniquely identifies it. 
Each ADT definition is given a fresh global -type variable as a handle, so pointer equality can be used to distinguish -different ADT names. - -For the purposes of Relay's type system, ADTs are differentiated by name; -that means that if two ADTs have different handles, they will be -considered different types even if all their constructors are -structurally identical. - -Recursion in an ADT definition thus follows just like recursion for a -global function: the constructor can simply reference the ADT handle -(global type variable) in its definition. - -See :py:class:`~tvm.relay.ty.GlobalTypeVar` for its definition and documentation. - -Definitions (Type Data) -~~~~~~~~~~~~~~~~~~~~~~~ - -Besides a name, an ADT needs to store the constructors that are used -to define it and any type parameters used within them. These are -stored in the module, :ref:`analogous to global function definitions`. - -While type-checking uses of ADTs, the type system sometimes must -index into the module using the ADT name to look up information -about constructors. For example, if a constructor is being pattern-matched -in a match expression clause, the type-checker must check the constructor's -signature to ensure that any bound variables are being assigned the -correct types. - -See :py:class:`~tvm.relay.adt.TypeData` for its definition and documentation. - -Type Call -~~~~~~~~~ - -Because an ADT definition can take type parameters, Relay's type -system considers an ADT definition to be a *type-level function* -(in that the definition takes type parameters and returns the -type of an ADT instance with those type parameters). Thus, any -instance of an ADT is typed using a type call, which explicitly -lists the type parameters given to the ADT definition. - -It is important to list the type parameters for an ADT instance, -as two ADT instances built using different constructors but the -same type parameters are of the *same type* while two ADT instances -with different type parameters should not be considered the same -type (e.g., a list of integers should not have the same type as -a list of pairs of floating point tensors). - -The "function" in the type call is the ADT handle and there must -be one argument for each type parameter in the ADT definition. (An -ADT definition with no arguments means that any instance will have -no type arguments passed to the type call). - -See :py:class:`~tvm.relay.ty.TypeCall` for its definition and documentation. - -Example: List ADT -~~~~~~~~~~~~~~~~~ - -This subsection uses the simple list ADT (included as a default -ADT in Relay) to illustrate the constructs described in the previous -sections. Its definition is as follows: - -.. code-block:: - - data List { - Nil : () -> List - Cons : (a, List[a]) -> List - } - -Thus, the global type variable :code:`List` is the handle for the ADT. -The type data for the list ADT in the module notes that -:code:`List` takes one type parameter and has two constructors, -:code:`Nil` (with signature :code:`fn() -> List[a]`) -and :code:`Cons` (with signature :code:`fn(a, List[a]) -> List[a]`). -The recursive reference to :code:`List` in the :code:`Cons` -constructor is accomplished by using the global type -variable :code:`List` in the constructor definition. - -Below two instances of lists with their types given, using type calls: - -.. 
code-block:: - - Cons(1, Cons(2, Nil())) # List[Tensor[(), int32]] - Cons((1, 1), Cons((2, 2), Nil())) # List[(Tensor[(), int32], Tensor[(), int32])] - -Note that :code:`Nil()` can be an instance of any list because it -does not take any arguments that use a type parameter. (Nevertheless, -for any *particular* instance of :code:`Nil()`, the type parameter must -be specified.) - -Here are two lists that are rejected by the type system because -the type parameters do not match: - -.. code-block:: - - # attempting to put an integer on a list of int * int tuples - Cons(1, Cons((1, 1), Nil())) - # attempting to put a list of ints on a list of lists of int * int tuples - Cons(Cons(1, Cons(2, Nil())), Cons(Cons((1, 1), Cons((2, 2), Nil())), Nil())) diff --git a/gallery/how_to/compile_models/README.txt b/gallery/how_to/compile_models/README.txt deleted file mode 100644 index 61cc5be2cb79..000000000000 --- a/gallery/how_to/compile_models/README.txt +++ /dev/null @@ -1,7 +0,0 @@ -.. _tutorial-frontend: - -Compile Deep Learning Models -============================ - -TVM includes a variety of front-ends that can import models in different -formats. These how-tos demostrate how to import models using the Python API. diff --git a/gallery/how_to/compile_models/from_coreml.py b/gallery/how_to/compile_models/from_coreml.py deleted file mode 100644 index b54329920b8d..000000000000 --- a/gallery/how_to/compile_models/from_coreml.py +++ /dev/null @@ -1,114 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile CoreML Models -===================== -**Author**: `Joshua Z. Zhang `_, \ - `Kazutaka Morita `_, \ - `Zhao Wu `_ - -This article is an introductory tutorial to deploy CoreML models with Relay. - -To begin, we must install coremltools: - -.. code-block:: bash - - %%shell - pip install coremltools - -or please refer to official site -https://github.com/apple/coremltools -""" - -import tvm -from tvm import te -import tvm.relay as relay -from tvm.contrib.download import download_testdata -import coremltools as cm -import numpy as np -from PIL import Image - -###################################################################### -# Load pretrained CoreML model -# ---------------------------- -# We will download and load a pretrained mobilenet classification network -# provided by apple in this example -model_url = "https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel" -model_file = "mobilenet.mlmodel" -model_path = download_testdata(model_url, model_file, module="coreml") -# Now you have mobilenet.mlmodel on disk -mlmodel = cm.models.MLModel(model_path) - -###################################################################### -# Load a test image -# ------------------ -# A single cat dominates the examples! 
-img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" -img_path = download_testdata(img_url, "cat.png", module="data") -img = Image.open(img_path).resize((224, 224)) -# Mobilenet.mlmodel's input is BGR format -img_bgr = np.array(img)[:, :, ::-1] -x = np.transpose(img_bgr, (2, 0, 1))[np.newaxis, :] - -###################################################################### -# Compile the model on Relay -# --------------------------- -# We should be familiar with the process right now. -target = "llvm" -shape_dict = {"image": x.shape} - -# Parse CoreML model and convert into Relay computation graph -mod, params = relay.frontend.from_coreml(mlmodel, shape_dict) - -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target, params=params) - -###################################################################### -# Execute on TVM -# ------------------- -# The process is no different from other example -from tvm.contrib import graph_executor - -dev = tvm.cpu(0) -dtype = "float32" -m = graph_executor.GraphModule(lib["default"](dev)) -# set inputs -m.set_input("image", tvm.nd.array(x.astype(dtype))) -# execute -m.run() -# get outputs -tvm_output = m.get_output(0) -top1 = np.argmax(tvm_output.numpy()[0]) - -##################################################################### -# Look up synset name -# ------------------- -# Look up prediction top 1 index in 1000 class synset. -synset_url = "".join( - [ - "https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt", - ] -) -synset_name = "imagenet1000_clsid_to_human.txt" -synset_path = download_testdata(synset_url, synset_name, module="data") -with open(synset_path) as f: - synset = eval(f.read()) -# You should see the following result: Top-1 id 282 class name tiger cat -print("Top-1 id", top1, "class name", synset[top1]) diff --git a/gallery/how_to/compile_models/from_darknet.py b/gallery/how_to/compile_models/from_darknet.py deleted file mode 100644 index ef0a8583777f..000000000000 --- a/gallery/how_to/compile_models/from_darknet.py +++ /dev/null @@ -1,203 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile YOLO-V2 and YOLO-V3 in DarkNet Models -============================================= -**Author**: `Siju Samuel `_ - -This article is an introductory tutorial to deploy darknet models with TVM. -All the required models and libraries will be downloaded from the internet by the script. -This script runs the YOLO-V2 and YOLO-V3 Model with the bounding boxes -Darknet parsing have dependancy with CFFI and CV2 library -Please install CFFI and CV2 before executing this script - -.. 
code-block:: bash - - %%shell - pip install cffi opencv-python - -""" - -# numpy and matplotlib -import numpy as np -import matplotlib.pyplot as plt -import sys - -# tvm, relay -import tvm -from tvm import te -from tvm import relay -from ctypes import * -from tvm.contrib.download import download_testdata -from tvm.relay.testing.darknet import __darknetffi__ -import tvm.relay.testing.yolo_detection -import tvm.relay.testing.darknet - -###################################################################### -# Choose the model -# ----------------------- -# Models are: 'yolov2', 'yolov3' or 'yolov3-tiny' - -# Model name -MODEL_NAME = "yolov3" - -###################################################################### -# Download required files -# ----------------------- -# Download cfg and weights file if first time. -CFG_NAME = MODEL_NAME + ".cfg" -WEIGHTS_NAME = MODEL_NAME + ".weights" -REPO_URL = "https://github.com/dmlc/web-data/blob/main/darknet/" -CFG_URL = REPO_URL + "cfg/" + CFG_NAME + "?raw=true" -WEIGHTS_URL = "https://pjreddie.com/media/files/" + WEIGHTS_NAME - -cfg_path = download_testdata(CFG_URL, CFG_NAME, module="darknet") -weights_path = download_testdata(WEIGHTS_URL, WEIGHTS_NAME, module="darknet") - -# Download and Load darknet library -if sys.platform in ["linux", "linux2"]: - DARKNET_LIB = "libdarknet2.0.so" - DARKNET_URL = REPO_URL + "lib/" + DARKNET_LIB + "?raw=true" -elif sys.platform == "darwin": - DARKNET_LIB = "libdarknet_mac2.0.so" - DARKNET_URL = REPO_URL + "lib_osx/" + DARKNET_LIB + "?raw=true" -else: - err = "Darknet lib is not supported on {} platform".format(sys.platform) - raise NotImplementedError(err) - -lib_path = download_testdata(DARKNET_URL, DARKNET_LIB, module="darknet") - -DARKNET_LIB = __darknetffi__.dlopen(lib_path) -net = DARKNET_LIB.load_network(cfg_path.encode("utf-8"), weights_path.encode("utf-8"), 0) -dtype = "float32" -batch_size = 1 - -data = np.empty([batch_size, net.c, net.h, net.w], dtype) -shape_dict = {"data": data.shape} -print("Converting darknet to relay functions...") -mod, params = relay.frontend.from_darknet(net, dtype=dtype, shape=data.shape) - -###################################################################### -# Import the graph to Relay -# ------------------------- -# compile the model -target = tvm.target.Target("llvm", host="llvm") -dev = tvm.cpu(0) -data = np.empty([batch_size, net.c, net.h, net.w], dtype) -shape = {"data": data.shape} -print("Compiling the model...") -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - -[neth, netw] = shape["data"][2:] # Current image shape is 608x608 -###################################################################### -# Load a test image -# ----------------- -test_image = "dog.jpg" -print("Loading the test image...") -img_url = REPO_URL + "data/" + test_image + "?raw=true" -img_path = download_testdata(img_url, test_image, "data") - -data = tvm.relay.testing.darknet.load_image(img_path, netw, neth) -###################################################################### -# Execute on TVM Runtime -# ---------------------- -# The process is no different from other examples. 
-from tvm.contrib import graph_executor - -m = graph_executor.GraphModule(lib["default"](dev)) - -# set inputs -m.set_input("data", tvm.nd.array(data.astype(dtype))) -# execute -print("Running the test image...") - -# detection -# thresholds -thresh = 0.5 -nms_thresh = 0.45 - -m.run() -# get outputs -tvm_out = [] -if MODEL_NAME == "yolov2": - layer_out = {} - layer_out["type"] = "Region" - # Get the region layer attributes (n, out_c, out_h, out_w, classes, coords, background) - layer_attr = m.get_output(2).numpy() - layer_out["biases"] = m.get_output(1).numpy() - out_shape = (layer_attr[0], layer_attr[1] // layer_attr[0], layer_attr[2], layer_attr[3]) - layer_out["output"] = m.get_output(0).numpy().reshape(out_shape) - layer_out["classes"] = layer_attr[4] - layer_out["coords"] = layer_attr[5] - layer_out["background"] = layer_attr[6] - tvm_out.append(layer_out) - -elif MODEL_NAME == "yolov3": - for i in range(3): - layer_out = {} - layer_out["type"] = "Yolo" - # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total) - layer_attr = m.get_output(i * 4 + 3).numpy() - layer_out["biases"] = m.get_output(i * 4 + 2).numpy() - layer_out["mask"] = m.get_output(i * 4 + 1).numpy() - out_shape = (layer_attr[0], layer_attr[1] // layer_attr[0], layer_attr[2], layer_attr[3]) - layer_out["output"] = m.get_output(i * 4).numpy().reshape(out_shape) - layer_out["classes"] = layer_attr[4] - tvm_out.append(layer_out) - -elif MODEL_NAME == "yolov3-tiny": - for i in range(2): - layer_out = {} - layer_out["type"] = "Yolo" - # Get the yolo layer attributes (n, out_c, out_h, out_w, classes, total) - layer_attr = m.get_output(i * 4 + 3).numpy() - layer_out["biases"] = m.get_output(i * 4 + 2).numpy() - layer_out["mask"] = m.get_output(i * 4 + 1).numpy() - out_shape = (layer_attr[0], layer_attr[1] // layer_attr[0], layer_attr[2], layer_attr[3]) - layer_out["output"] = m.get_output(i * 4).numpy().reshape(out_shape) - layer_out["classes"] = layer_attr[4] - tvm_out.append(layer_out) - thresh = 0.560 - -# do the detection and bring up the bounding boxes -img = tvm.relay.testing.darknet.load_image_color(img_path) -_, im_h, im_w = img.shape -dets = tvm.relay.testing.yolo_detection.fill_network_boxes( - (netw, neth), (im_w, im_h), thresh, 1, tvm_out -) -last_layer = net.layers[net.n - 1] -tvm.relay.testing.yolo_detection.do_nms_sort(dets, last_layer.classes, nms_thresh) - -coco_name = "coco.names" -coco_url = REPO_URL + "data/" + coco_name + "?raw=true" -font_name = "arial.ttf" -font_url = REPO_URL + "data/" + font_name + "?raw=true" -coco_path = download_testdata(coco_url, coco_name, module="data") -font_path = download_testdata(font_url, font_name, module="data") - -with open(coco_path) as f: - content = f.readlines() - -names = [x.strip() for x in content] - -tvm.relay.testing.yolo_detection.show_detections(img, dets, thresh, names, last_layer.classes) -tvm.relay.testing.yolo_detection.draw_detections( - font_path, img, dets, thresh, names, last_layer.classes -) -plt.imshow(img.transpose(1, 2, 0)) -plt.show() diff --git a/gallery/how_to/compile_models/from_keras.py b/gallery/how_to/compile_models/from_keras.py deleted file mode 100644 index 3da674c25086..000000000000 --- a/gallery/how_to/compile_models/from_keras.py +++ /dev/null @@ -1,141 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile Keras Models -===================== -**Author**: `Yuwei Hu `_ - -This article is an introductory tutorial to deploy Keras models with Relay. - -For us to begin with, keras should be installed. -Tensorflow is also required since it's used as the default backend of keras. - -A quick solution is to install via pip - -.. code-block:: bash - - %%shell - pip install keras tensorflow - -or please refer to official site -https://keras.io/#installation -""" - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import tvm -from tvm import te -import tvm.relay as relay -from tvm.contrib.download import download_testdata -import keras -import tensorflow as tf -import numpy as np - -###################################################################### -# Load pretrained keras model -# ---------------------------- -# We load a pretrained resnet-50 classification model provided by keras. - -if tuple(keras.__version__.split(".")) < ("2", "4", "0"): - weights_url = "".join( - [ - "https://github.com/fchollet/deep-learning-models/releases/", - "download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5", - ] - ) - weights_file = "resnet50_keras_old.h5" -else: - weights_url = "".join( - [ - " https://storage.googleapis.com/tensorflow/keras-applications/", - "resnet/resnet50_weights_tf_dim_ordering_tf_kernels.h5", - ] - ) - weights_file = "resnet50_keras_new.h5" - - -weights_path = download_testdata(weights_url, weights_file, module="keras") -keras_resnet50 = tf.keras.applications.resnet50.ResNet50( - include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000 -) -keras_resnet50.load_weights(weights_path) - -###################################################################### -# Load a test image -# ------------------ -# A single cat dominates the examples! -from PIL import Image -from matplotlib import pyplot as plt -from tensorflow.keras.applications.resnet50 import preprocess_input - -img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" -img_path = download_testdata(img_url, "cat.png", module="data") -img = Image.open(img_path).resize((224, 224)) -plt.imshow(img) -plt.show() -# input preprocess -data = np.array(img)[np.newaxis, :].astype("float32") -data = preprocess_input(data).transpose([0, 3, 1, 2]) -print("input_1", data.shape) - -###################################################################### -# Compile the model with Relay -# ---------------------------- -# convert the keras model(NHWC layout) to Relay format(NCHW layout). -shape_dict = {"input_1": data.shape} -mod, params = relay.frontend.from_keras(keras_resnet50, shape_dict) -# compile the model -target = "cuda" -dev = tvm.cuda(0) - -# TODO(mbs): opt_level=3 causes nn.contrib_conv2d_winograd_weight_transform -# to end up in the module which fails memory validation on cuda most likely -# due to a latent bug. 
Note that the pass context only has an effect within -# evaluate() and is not captured by create_executor(). -with tvm.transform.PassContext(opt_level=0): - model = relay.build_module.create_executor("graph", mod, dev, target, params).evaluate() - - -###################################################################### -# Execute on TVM -# --------------- -dtype = "float32" -tvm_out = model(tvm.nd.array(data.astype(dtype))) -top1_tvm = np.argmax(tvm_out.numpy()[0]) - -##################################################################### -# Look up synset name -# ------------------- -# Look up prediction top 1 index in 1000 class synset. -synset_url = "".join( - [ - "https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt", - ] -) -synset_name = "imagenet1000_clsid_to_human.txt" -synset_path = download_testdata(synset_url, synset_name, module="data") -with open(synset_path) as f: - synset = eval(f.read()) -print("Relay top-1 id: {}, class name: {}".format(top1_tvm, synset[top1_tvm])) -# confirm correctness with keras output -keras_out = keras_resnet50.predict(data.transpose([0, 2, 3, 1])) -top1_keras = np.argmax(keras_out) -print("Keras top-1 id: {}, class name: {}".format(top1_keras, synset[top1_keras])) diff --git a/gallery/how_to/compile_models/from_onnx.py b/gallery/how_to/compile_models/from_onnx.py deleted file mode 100644 index c1f9be72c54c..000000000000 --- a/gallery/how_to/compile_models/from_onnx.py +++ /dev/null @@ -1,135 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile ONNX Models -=================== -**Author**: `Joshua Z. Zhang `_ - -This article is an introductory tutorial to deploy ONNX models with Relay. - -To begin, install the ONNX package: - -.. 
code-block:: bash - - %%shell - pip install onnx onnxoptimizer - -Alternatively, you can refer to official site: -https://github.com/onnx/onnx -""" - -import onnx -import numpy as np -import tvm -from tvm import te -import tvm.relay as relay -from tvm.contrib.download import download_testdata - -###################################################################### -# Load pretrained ONNX model -# --------------------------------------------- -# The example super resolution model used here is exactly the same model in onnx tutorial -# http://pytorch.org/tutorials/advanced/super_resolution_with_caffe2.html -# we skip the pytorch model construction part, and download the saved onnx model -model_url = "".join( - [ - "https://gist.github.com/zhreshold/", - "bcda4716699ac97ea44f791c24310193/raw/", - "93672b029103648953c4e5ad3ac3aadf346a4cdc/", - "super_resolution_0.2.onnx", - ] -) -model_path = download_testdata(model_url, "super_resolution.onnx", module="onnx") -# now you have super_resolution.onnx on disk -onnx_model = onnx.load(model_path) - -###################################################################### -# Load a test image -# --------------------------------------------- -# A single cat dominates the examples! This model takes a single input image of size -# 224x224 and outputs a scaled image that is 3x greater than the input along each -# axis, a 672x672 image. Re-scale the cat image to fit this input shape then -# convert to `YCbCr`. The super resolution model will then be applied to the -# luminance (`Y`) channel. -from PIL import Image - -img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" -img_path = download_testdata(img_url, "cat.png", module="data") -img = Image.open(img_path).resize((224, 224)) -img_ycbcr = img.convert("YCbCr") # convert to YCbCr -img_y, img_cb, img_cr = img_ycbcr.split() -x = np.array(img_y)[np.newaxis, np.newaxis, :, :] - -###################################################################### -# Compile the model with relay -# --------------------------------------------- -# Typically ONNX models mix model input values with parameter values, with -# the input having the name `1`. This model dependent, and you should check -# with the documentation for your model to determine the full input and -# parameter name space. -# -# Passing in the shape dictionary to the `relay.frontend.from_onnx` method -# tells relay which ONNX parameters are inputs, and which are parameters, and -# provides a static definition of the input size. -target = "llvm" - -input_name = "1" -shape_dict = {input_name: x.shape} -mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) - -with tvm.transform.PassContext(opt_level=1): - executor = relay.build_module.create_executor( - "graph", mod, tvm.cpu(0), target, params - ).evaluate() - -###################################################################### -# Execute on TVM -# --------------------------------------------- -dtype = "float32" -tvm_output = executor(tvm.nd.array(x.astype(dtype))).numpy() - -###################################################################### -# Display results -# --------------------------------------------- -# We put input and output image neck to neck. The luminance channel, `Y` is the output -# from the model. The chroma channels `Cb` and `Cr` are resized to match with a simple -# bicubic algorithm. The image is then recombined and converted back to `RGB`. 
-from matplotlib import pyplot as plt - -out_y = Image.fromarray(np.uint8((tvm_output[0, 0]).clip(0, 255)), mode="L") -out_cb = img_cb.resize(out_y.size, Image.BICUBIC) -out_cr = img_cr.resize(out_y.size, Image.BICUBIC) -result = Image.merge("YCbCr", [out_y, out_cb, out_cr]).convert("RGB") -canvas = np.full((672, 672 * 2, 3), 255) -canvas[0:224, 0:224, :] = np.asarray(img) -canvas[:, 672:, :] = np.asarray(result) -plt.imshow(canvas.astype(np.uint8)) -plt.show() - -###################################################################### -# Notes -# --------------------------------------------- -# By default, ONNX defines models in terms of dynamic shapes. The ONNX importer -# retains that dynamism upon import, and the compiler attempts to convert the model -# into a static shapes at compile time. If this fails, there may still be dynamic -# operations in the model. Not all TVM kernels currently support dynamic shapes, -# please file an issue on discuss.tvm.apache.org if you hit an error with dynamic kernels. -# -# This particular model was build using an older version of ONNX. During the import -# phase ONNX importer will run the ONNX verifier, which may throw a `Mismatched attribute type` -# warning. Because TVM supports a number of different ONNX versions, the Relay model -# will still be valid. diff --git a/gallery/how_to/compile_models/from_paddle.py b/gallery/how_to/compile_models/from_paddle.py deleted file mode 100644 index 5e78c8c3b06c..000000000000 --- a/gallery/how_to/compile_models/from_paddle.py +++ /dev/null @@ -1,118 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile PaddlePaddle Models -=========================== -**Author**: `Ziyuan Ma `_ - -This article is an introductory tutorial to deploy PaddlePaddle models with Relay. -To begin, we'll install PaddlePaddle>=2.1.3: - -.. code-block:: bash - - %%shell - pip install paddlepaddle -i https://mirror.baidu.com/pypi/simple - -For more details, refer to the official install instructions at: -https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html -""" - -import tarfile -import paddle -import numpy as np -import tvm -from tvm import relay -from tvm.contrib.download import download_testdata - -###################################################################### -# Load pretrained ResNet50 model -# --------------------------------------------- -# We load a pretrained ResNet50 provided by PaddlePaddle. 
-url = "https://bj.bcebos.com/x2paddle/models/paddle_resnet50.tar" -model_path = download_testdata(url, "paddle_resnet50.tar", module="model") - -with tarfile.open(model_path) as tar: - names = tar.getnames() - for name in names: - tar.extract(name, "./") - -model = paddle.jit.load("./paddle_resnet50/model") - -###################################################################### -# Load a test image -# --------------------------------------------- -# A single cat dominates the examples! - -from PIL import Image -import paddle.vision.transforms as T - - -transforms = T.Compose( - [ - T.Resize((256, 256)), - T.CenterCrop(224), - T.ToTensor(), - T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] -) - -img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" -img_path = download_testdata(img_url, "cat.png", module="data") -img = Image.open(img_path).resize((224, 224)) - -img = transforms(img) -img = np.expand_dims(img, axis=0) - -###################################################################### -# Compile the model with relay -# --------------------------------------------- - -target = "llvm" -shape_dict = {"inputs": img.shape} -mod, params = relay.frontend.from_paddle(model, shape_dict) - -with tvm.transform.PassContext(opt_level=3): - executor = relay.build_module.create_executor( - "graph", mod, tvm.cpu(0), target, params - ).evaluate() - -###################################################################### -# Execute on TVM -# --------------------------------------------- -dtype = "float32" -tvm_output = executor(tvm.nd.array(img.astype(dtype))).numpy() - -###################################################################### -# Look up synset name -# --------------------------------------------- -# Look up prediction top 1 index in 1000 class synset. - -synset_url = "".join( - [ - "https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt", - ] -) -synset_name = "imagenet1000_clsid_to_human.txt" -synset_path = download_testdata(synset_url, synset_name, module="data") -with open(synset_path) as f: - synset = f.readlines() - -top1 = np.argmax(tvm_output[0]) -print(f"TVM prediction top-1 id: {top1}, class name: {synset[top1]}") diff --git a/gallery/how_to/compile_models/from_pytorch.py b/gallery/how_to/compile_models/from_pytorch.py deleted file mode 100644 index 14c264b9f4ac..000000000000 --- a/gallery/how_to/compile_models/from_pytorch.py +++ /dev/null @@ -1,171 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile PyTorch Models -====================== -**Author**: `Alex Wong `_ - -This article is an introductory tutorial to deploy PyTorch models with Relay. 
- -For us to begin, PyTorch should be installed. -TorchVision is also required so we can use the model zoo. -A quick solution is to install via pip: - -.. code-block:: bash - - %%shell - pip install torch - pip install torchvision - -or please refer to official site -https://pytorch.org/get-started/locally/ - -PyTorch versions should be backwards compatible but should be used -with the proper TorchVision version. - -Currently, TVM supports PyTorch 1.7 and 1.4. Other versions may -be unstable. -""" - -import tvm -from tvm import relay - -import numpy as np - -from tvm.contrib.download import download_testdata - -# PyTorch imports -import torch -import torchvision - -###################################################################### -# Load a pretrained PyTorch model -# ------------------------------- -model_name = "resnet18" -model = getattr(torchvision.models, model_name)(pretrained=True) -model = model.eval() - -# We grab the TorchScripted model via tracing -input_shape = [1, 3, 224, 224] -input_data = torch.randn(input_shape) -scripted_model = torch.jit.trace(model, input_data).eval() - -###################################################################### -# Load a test image -# ----------------- -# Classic cat example! -from PIL import Image - -img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" -img_path = download_testdata(img_url, "cat.png", module="data") -img = Image.open(img_path).resize((224, 224)) - -# Preprocess the image and convert to tensor -from torchvision import transforms - -my_preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] -) -img = my_preprocess(img) -img = np.expand_dims(img, 0) - -###################################################################### -# Import the graph to Relay -# ------------------------- -# Convert PyTorch graph to Relay graph. The input name can be arbitrary. -input_name = "input0" -shape_list = [(input_name, img.shape)] -mod, params = relay.frontend.from_pytorch(scripted_model, shape_list) - -###################################################################### -# Relay Build -# ----------- -# Compile the graph to llvm target with given input specification. -target = tvm.target.Target("llvm", host="llvm") -dev = tvm.cpu(0) -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - -###################################################################### -# Execute the portable graph on TVM -# --------------------------------- -# Now we can try deploying the compiled model on target. -from tvm.contrib import graph_executor - -dtype = "float32" -m = graph_executor.GraphModule(lib["default"](dev)) -# Set inputs -m.set_input(input_name, tvm.nd.array(img.astype(dtype))) -# Execute -m.run() -# Get outputs -tvm_output = m.get_output(0) - -##################################################################### -# Look up synset name -# ------------------- -# Look up prediction top 1 index in 1000 class synset. 
-synset_url = "".join( - [ - "https://raw.githubusercontent.com/Cadene/", - "pretrained-models.pytorch/master/data/", - "imagenet_synsets.txt", - ] -) -synset_name = "imagenet_synsets.txt" -synset_path = download_testdata(synset_url, synset_name, module="data") -with open(synset_path) as f: - synsets = f.readlines() - -synsets = [x.strip() for x in synsets] -splits = [line.split(" ") for line in synsets] -key_to_classname = {spl[0]: " ".join(spl[1:]) for spl in splits} - -class_url = "".join( - [ - "https://raw.githubusercontent.com/Cadene/", - "pretrained-models.pytorch/master/data/", - "imagenet_classes.txt", - ] -) -class_name = "imagenet_classes.txt" -class_path = download_testdata(class_url, class_name, module="data") -with open(class_path) as f: - class_id_to_key = f.readlines() - -class_id_to_key = [x.strip() for x in class_id_to_key] - -# Get top-1 result for TVM -top1_tvm = np.argmax(tvm_output.numpy()[0]) -tvm_class_key = class_id_to_key[top1_tvm] - -# Convert input to PyTorch variable and get PyTorch result for comparison -with torch.no_grad(): - torch_img = torch.from_numpy(img) - output = model(torch_img) - - # Get top-1 result for PyTorch - top1_torch = np.argmax(output.numpy()) - torch_class_key = class_id_to_key[top1_torch] - -print("Relay top-1 id: {}, class name: {}".format(top1_tvm, key_to_classname[tvm_class_key])) -print("Torch top-1 id: {}, class name: {}".format(top1_torch, key_to_classname[torch_class_key])) diff --git a/gallery/how_to/compile_models/from_tensorflow.py b/gallery/how_to/compile_models/from_tensorflow.py deleted file mode 100644 index 741d98109450..000000000000 --- a/gallery/how_to/compile_models/from_tensorflow.py +++ /dev/null @@ -1,257 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile Tensorflow Models -========================= -This article is an introductory tutorial to deploy tensorflow models with TVM. - -For us to begin with, tensorflow python module is required to be installed. - -.. code-block:: bash - - %%shell - pip install tensorflow - -Please refer to https://www.tensorflow.org/install -""" - -# tvm, relay -import tvm -from tvm import te -from tvm import relay - -# os and numpy -import numpy as np -import os.path - -# Tensorflow imports -import tensorflow as tf - - -# Ask tensorflow to limit its GPU memory to what's actually needed -# instead of gobbling everything that's available. -# https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth -# This way this tutorial is a little more friendly to sphinx-gallery. 
-gpus = tf.config.list_physical_devices("GPU") -if gpus: - try: - for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) - print("tensorflow will use experimental.set_memory_growth(True)") - except RuntimeError as e: - print("experimental.set_memory_growth option is not available: {}".format(e)) - - -try: - tf_compat_v1 = tf.compat.v1 -except ImportError: - tf_compat_v1 = tf - -# Tensorflow utility functions -import tvm.relay.testing.tf as tf_testing - -# Base location for model related files. -repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/" - -# Test image -img_name = "elephant-299.jpg" -image_url = os.path.join(repo_base, img_name) - -###################################################################### -# Tutorials -# --------- -# Please refer docs/frontend/tensorflow.md for more details for various models -# from tensorflow. - -model_name = "classify_image_graph_def-with_shapes.pb" -model_url = os.path.join(repo_base, model_name) - -# Image label map -map_proto = "imagenet_2012_challenge_label_map_proto.pbtxt" -map_proto_url = os.path.join(repo_base, map_proto) - -# Human readable text for labels -label_map = "imagenet_synset_to_human_label_map.txt" -label_map_url = os.path.join(repo_base, label_map) - -# Target settings -# Use these commented settings to build for cuda. -# target = tvm.target.Target("cuda", host="llvm") -# layout = "NCHW" -# dev = tvm.cuda(0) -target = tvm.target.Target("llvm", host="llvm") -layout = None -dev = tvm.cpu(0) - -###################################################################### -# Download required files -# ----------------------- -# Download files listed above. -from tvm.contrib.download import download_testdata - -img_path = download_testdata(image_url, img_name, module="data") -model_path = download_testdata(model_url, model_name, module=["tf", "InceptionV1"]) -map_proto_path = download_testdata(map_proto_url, map_proto, module="data") -label_path = download_testdata(label_map_url, label_map, module="data") - -###################################################################### -# Import model -# ------------ -# Creates tensorflow graph definition from protobuf file. - -with tf_compat_v1.gfile.GFile(model_path, "rb") as f: - graph_def = tf_compat_v1.GraphDef() - graph_def.ParseFromString(f.read()) - graph = tf.import_graph_def(graph_def, name="") - # Call the utility to import the graph definition into default graph. - graph_def = tf_testing.ProcessGraphDefParam(graph_def) - # Add shapes to the graph. - with tf_compat_v1.Session() as sess: - graph_def = tf_testing.AddShapesToGraphDef(sess, "softmax") - -###################################################################### -# Decode image -# ------------ -# .. note:: -# -# tensorflow frontend import doesn't support preprocessing ops like JpegDecode. -# JpegDecode is bypassed (just return source node). -# Hence we supply decoded frame to TVM instead. -# - -from PIL import Image - -image = Image.open(img_path).resize((299, 299)) - -x = np.array(image) - -###################################################################### -# Import the graph to Relay -# ------------------------- -# Import tensorflow graph definition to relay frontend. -# -# Results: -# sym: relay expr for given tensorflow protobuf. -# params: params converted from tensorflow params (tensor protobuf). 
-shape_dict = {"DecodeJpeg/contents": x.shape} -dtype_dict = {"DecodeJpeg/contents": "uint8"} -mod, params = relay.frontend.from_tensorflow(graph_def, layout=layout, shape=shape_dict) - -print("Tensorflow protobuf imported to relay frontend.") -###################################################################### -# Relay Build -# ----------- -# Compile the graph to llvm target with given input specification. -# -# Results: -# graph: Final graph after compilation. -# params: final params after compilation. -# lib: target library which can be deployed on target with TVM runtime. - -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target, params=params) - -###################################################################### -# Execute the portable graph on TVM -# --------------------------------- -# Now we can try deploying the compiled model on target. - -from tvm.contrib import graph_executor - -dtype = "uint8" -m = graph_executor.GraphModule(lib["default"](dev)) -# set inputs -m.set_input("DecodeJpeg/contents", tvm.nd.array(x.astype(dtype))) -# execute -m.run() -# get outputs -tvm_output = m.get_output(0, tvm.nd.empty(((1, 1008)), "float32")) - -###################################################################### -# Process the output -# ------------------ -# Process the model output to human readable text for InceptionV1. -predictions = tvm_output.numpy() -predictions = np.squeeze(predictions) - -# Creates node ID --> English string lookup. -node_lookup = tf_testing.NodeLookup(label_lookup_path=map_proto_path, uid_lookup_path=label_path) - -# Print top 5 predictions from TVM output. -top_k = predictions.argsort()[-5:][::-1] -for node_id in top_k: - human_string = node_lookup.id_to_string(node_id) - score = predictions[node_id] - print("%s (score = %.5f)" % (human_string, score)) - -###################################################################### -# Inference on tensorflow -# ----------------------- -# Run the corresponding model on tensorflow - - -def create_graph(): - """Creates a graph from saved GraphDef file and returns a saver.""" - # Creates graph from saved graph_def.pb. - with tf_compat_v1.gfile.GFile(model_path, "rb") as f: - graph_def = tf_compat_v1.GraphDef() - graph_def.ParseFromString(f.read()) - graph = tf.import_graph_def(graph_def, name="") - # Call the utility to import the graph definition into default graph. - graph_def = tf_testing.ProcessGraphDefParam(graph_def) - - -def run_inference_on_image(image): - """Runs inference on an image. - - Parameters - ---------- - image: String - Image file name. - - Returns - ------- - Nothing - """ - if not tf_compat_v1.gfile.Exists(image): - tf.logging.fatal("File does not exist %s", image) - image_data = tf_compat_v1.gfile.GFile(image, "rb").read() - - # Creates graph from saved GraphDef. - create_graph() - - with tf_compat_v1.Session() as sess: - softmax_tensor = sess.graph.get_tensor_by_name("softmax:0") - predictions = sess.run(softmax_tensor, {"DecodeJpeg/contents:0": image_data}) - - predictions = np.squeeze(predictions) - - # Creates node ID --> English string lookup. - node_lookup = tf_testing.NodeLookup( - label_lookup_path=map_proto_path, uid_lookup_path=label_path - ) - - # Print top 5 predictions from tensorflow. 
- top_k = predictions.argsort()[-5:][::-1] - print("===== TENSORFLOW RESULTS =======") - for node_id in top_k: - human_string = node_lookup.id_to_string(node_id) - score = predictions[node_id] - print("%s (score = %.5f)" % (human_string, score)) - - -run_inference_on_image(img_path) diff --git a/gallery/how_to/compile_models/from_tflite.py b/gallery/how_to/compile_models/from_tflite.py deleted file mode 100644 index 226e67c82e89..000000000000 --- a/gallery/how_to/compile_models/from_tflite.py +++ /dev/null @@ -1,193 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile TFLite Models -===================== -**Author**: `Zhao Wu `_ - -This article is an introductory tutorial to deploy TFLite models with Relay. - -To get started, TFLite package needs to be installed as prerequisite. - -.. code-block:: bash - - %%shell - pip install tflite==2.1.0 - -or you could generate TFLite package yourself. The steps are the following: - -.. code-block:: bash - - # Get the flatc compiler. - # Please refer to https://github.com/google/flatbuffers for details - # and make sure it is properly installed. - flatc --version - - # Get the TFLite schema. - wget https://raw.githubusercontent.com/tensorflow/tensorflow/r1.13/tensorflow/lite/schema/schema.fbs - - # Generate TFLite package. - flatc --python schema.fbs - - # Add current folder (which contains generated tflite module) to PYTHONPATH. - export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$(pwd) - - -Now please check if TFLite package is installed successfully, ``python -c "import tflite"`` - -Below you can find an example on how to compile TFLite model using TVM. 
-""" - -###################################################################### -# Utils for downloading and extracting zip files -# ---------------------------------------------- - -import os - - -def extract(path): - import tarfile - - if path.endswith("tgz") or path.endswith("gz"): - dir_path = os.path.dirname(path) - tar = tarfile.open(path) - tar.extractall(path=dir_path) - tar.close() - else: - raise RuntimeError("Could not decompress the file: " + path) - - -###################################################################### -# Load pretrained TFLite model -# ---------------------------- -# Load mobilenet V1 TFLite model provided by Google -from tvm.contrib.download import download_testdata - -model_url = "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz" - -# Download model tar file and extract it to get mobilenet_v1_1.0_224.tflite -model_path = download_testdata(model_url, "mobilenet_v1_1.0_224.tgz", module=["tf", "official"]) -model_dir = os.path.dirname(model_path) -extract(model_path) - -# Now we can open mobilenet_v1_1.0_224.tflite -tflite_model_file = os.path.join(model_dir, "mobilenet_v1_1.0_224.tflite") -tflite_model_buf = open(tflite_model_file, "rb").read() - -# Get TFLite model from buffer -try: - import tflite - - tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) -except AttributeError: - import tflite.Model - - tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) - -###################################################################### -# Load a test image -# ----------------- -# A single cat dominates the examples! -from PIL import Image -from matplotlib import pyplot as plt -import numpy as np - -image_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" -image_path = download_testdata(image_url, "cat.png", module="data") -resized_image = Image.open(image_path).resize((224, 224)) -plt.imshow(resized_image) -plt.show() -image_data = np.asarray(resized_image).astype("float32") - -# Add a dimension to the image so that we have NHWC format layout -image_data = np.expand_dims(image_data, axis=0) - -# Preprocess image as described here: -# https://github.com/tensorflow/models/blob/edb6ed22a801665946c63d650ab9a0b23d98e1b1/research/slim/preprocessing/inception_preprocessing.py#L243 -image_data[:, :, :, 0] = 2.0 / 255.0 * image_data[:, :, :, 0] - 1 -image_data[:, :, :, 1] = 2.0 / 255.0 * image_data[:, :, :, 1] - 1 -image_data[:, :, :, 2] = 2.0 / 255.0 * image_data[:, :, :, 2] - 1 -print("input", image_data.shape) - -###################################################################### -# Compile the model with relay -# ---------------------------- - -# TFLite input tensor name, shape and type -input_tensor = "input" -input_shape = (1, 224, 224, 3) -input_dtype = "float32" - -# Parse TFLite model and convert it to a Relay module -from tvm import relay, transform - -mod, params = relay.frontend.from_tflite( - tflite_model, shape_dict={input_tensor: input_shape}, dtype_dict={input_tensor: input_dtype} -) - -# Build the module against to x86 CPU -target = "llvm" -with transform.PassContext(opt_level=3): - lib = relay.build(mod, target, params=params) - -###################################################################### -# Execute on TVM -# -------------- -import tvm -from tvm import te -from tvm.contrib import graph_executor as runtime - -# Create a runtime executor module -module = runtime.GraphModule(lib["default"](tvm.cpu())) - -# Feed input data -module.set_input(input_tensor, 
tvm.nd.array(image_data)) - -# Run -module.run() - -# Get output -tvm_output = module.get_output(0).numpy() - -###################################################################### -# Display results -# --------------- - -# Load label file -label_file_url = "".join( - [ - "https://raw.githubusercontent.com/", - "tensorflow/tensorflow/master/tensorflow/lite/java/demo/", - "app/src/main/assets/", - "labels_mobilenet_quant_v1_224.txt", - ] -) -label_file = "labels_mobilenet_quant_v1_224.txt" -label_path = download_testdata(label_file_url, label_file, module="data") - -# List of 1001 classes -with open(label_path) as f: - labels = f.readlines() - -# Convert result to 1D data -predictions = np.squeeze(tvm_output) - -# Get top 1 prediction -prediction = np.argmax(predictions) - -# Convert id to class name and show the result -print("The image prediction result is: id " + str(prediction) + " name: " + labels[prediction]) diff --git a/gallery/how_to/deploy_models/README.txt b/gallery/how_to/deploy_models/README.txt deleted file mode 100644 index 22323c4e521b..000000000000 --- a/gallery/how_to/deploy_models/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -Deploy Deep Learning Models ---------------------------- - -TVM is capable of deploying models to a variety of different platforms. These -how-tos describe how to prepapre and deploy models to many of the supported -backends. diff --git a/gallery/how_to/deploy_models/deploy_model_on_adreno.py b/gallery/how_to/deploy_models/deploy_model_on_adreno.py deleted file mode 100644 index 0ea76bb288f8..000000000000 --- a/gallery/how_to/deploy_models/deploy_model_on_adreno.py +++ /dev/null @@ -1,473 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -.. _tutorial-deploy-model-on-adreno: - -Deploy the Pretrained Model on Adreno™ -====================================== -**Author**: Daniil Barinov, Siva Rama Krishna - -This article is a step-by-step tutorial to deploy pretrained Pytorch ResNet-18 model on Adreno (on different precisions). - -For us to begin with, PyTorch must be installed. -TorchVision is also required since we will be using it as our model zoo. - -A quick solution is to install it via pip: - -.. code-block:: bash - - %%shell - pip install torch - pip install torchvision - -Besides that, you should have TVM builded for Android. -See the following instructions on how to build it. - -`Deploy to Adreno GPU `_ - -After the build section there should be two files in *build* directory «libtvm_runtime.so» and «tvm_rpc». -Let's push them to the device and run TVM RPC Server. -""" - -###################################################################### -# TVM RPC Server -# -------------- -# To get the hash of the device use: -# -# .. 
code-block:: bash -# -# adb devices -# -# Set the android device to use, if you have several devices connected to your computer. -# -# .. code-block:: bash -# -# export ANDROID_SERIAL= -# -# Then to upload these two files to the device you should use: -# -# .. code-block:: bash -# -# adb push {libtvm_runtime.so,tvm_rpc} /data/local/tmp -# -# At this moment you will have «libtvm_runtime.so» and «tvm_rpc» on path /data/local/tmp on your device. -# Sometimes cmake can’t find «libc++_shared.so». Use: -# -# .. code-block:: bash -# -# find ${ANDROID_NDK_HOME} -name libc++_shared.so -# -# to find it and also push it with adb on the desired device: -# -# .. code-block:: bash -# -# adb push libc++_shared.so /data/local/tmp -# -# We are now ready to run the TVM RPC Server. -# Launch rpc_tracker with following line in 1st console: -# -# .. code-block:: bash -# -# python3 -m tvm.exec.rpc_tracker --port 9190 -# -# Then we need to run tvm_rpc server from under the desired device in 2nd console: -# -# .. code-block:: bash -# -# adb reverse tcp:9190 tcp:9190 -# adb forward tcp:5000 tcp:5000 -# adb forward tcp:5002 tcp:5001 -# adb forward tcp:5003 tcp:5002 -# adb forward tcp:5004 tcp:5003 -# adb shell LD_LIBRARY_PATH=/data/local/tmp /data/local/tmp/tvm_rpc server --host=0.0.0.0 --port=5000 --tracker=127.0.0.1:9190 --key=android --port-end=5100 -# -# Before proceeding to compile and infer model, specify TVM_TRACKER_HOST and TVM_TRACKER_PORT -# -# .. code-block:: bash -# -# export TVM_TRACKER_HOST=0.0.0.0 -# export TVM_TRACKER_PORT=9190 -# -# check that the tracker is running and the device is available -# -# .. code-block:: bash -# -# python -m tvm.exec.query_rpc_tracker --port 9190 -# -# For example, if we have 1 Android device, -# the output can be: -# -# .. code-block:: bash -# -# Queue Status -# ---------------------------------- -# key total free pending -# ---------------------------------- -# android 1 1 0 -# ---------------------------------- - -################################################################# -# Configuration -# ------------- - -import os -import torch -import torchvision -import tvm -from tvm import te -from tvm import relay, rpc -from tvm.contrib import utils, ndk -from tvm.contrib import graph_executor -from tvm.relay.op.contrib import clml -from tvm import autotvm - -# Below are set of configuration that controls the behaviour of this script like -# local run or device run, target definitions, dtype setting and auto tuning enablement. -# Change these settings as needed if required. - -# Adreno devices are efficient with float16 compared to float32 -# Given the expected output doesn't effect by lowering precision -# it's advisable to use lower precision. -# We have a helper API to make the precision conversion simple and -# it supports dtype with "float16" and "float16_acc32" modes. -# Let's choose "float16" for calculation and "float32" for accumulation. - -calculation_dtype = "float16" -acc_dtype = "float32" - -# Specify Adreno target before compiling to generate texture -# leveraging kernels and get all the benefits of textures -# Note: This generated example running on our x86 server for demonstration. -# If running it on the Android device, we need to -# specify its instruction set. Set :code:`local_demo` to False if you want -# to run this tutorial with a real device over rpc. -local_demo = True - -# by default on CPU target will execute. -# select 'cpu', 'opencl' and 'opencl -device=adreno' -test_target = "cpu" - -# Change target configuration. 
-
-# Run `adb shell cat /proc/cpuinfo` to find the arch.
-arch = "arm64"
-target = tvm.target.Target("llvm -mtriple=%s-linux-android" % arch)
-
-# Auto-tuning is a compute-intensive and time-consuming task,
-# hence it is disabled for the default run. Please enable it if required.
-is_tuning = False
-tune_log = "adreno-resnet18.log"
-
-# Set to True to enable the OpenCLML accelerated operator library.
-enable_clml = False
-
-#################################################################
-# Get a PyTorch Model
-# -------------------
-# Get resnet18 from torchvision models
-model_name = "resnet18"
-model = getattr(torchvision.models, model_name)(pretrained=True)
-model = model.eval()
-
-# We grab the TorchScripted model via tracing
-input_shape = [1, 3, 224, 224]
-input_data = torch.randn(input_shape)
-scripted_model = torch.jit.trace(model, input_data).eval()
-
-#################################################################
-# Load a test image
-# -----------------
-# As an example we use the classic cat image from ImageNet
-
-from PIL import Image
-from tvm.contrib.download import download_testdata
-from matplotlib import pyplot as plt
-import numpy as np
-
-img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
-img_path = download_testdata(img_url, "cat.png", module="data")
-img = Image.open(img_path).resize((224, 224))
-plt.imshow(img)
-plt.show()
-
-# Preprocess the image and convert to tensor
-from torchvision import transforms
-
-my_preprocess = transforms.Compose(
-    [
-        transforms.Resize(256),
-        transforms.CenterCrop(224),
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-    ]
-)
-img = my_preprocess(img)
-img = np.expand_dims(img, 0)
-
-#################################################################
-# Convert PyTorch model to Relay module
-# -------------------------------------
-# TVM has frontend APIs for various frameworks under relay.frontend; for
-# PyTorch model import we use the relay.frontend.from_pytorch API.
-# The input name can be arbitrary.
-input_name = "input0"
-shape_list = [(input_name, img.shape)]
-
-mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
-
-#################################################################
-# Precisions
-# ----------
-
-# Adreno devices are more efficient with float16 than with float32.
-# As long as the expected output is not affected by the lower precision,
-# it is advisable to use lower precision.
-
-# TVM supports mixed precision through the ToMixedPrecision transformation pass.
-# We may need to register precision rules such as the precision type, accumulation
-# datatype, etc. for the required operators to override the default settings.
-# The helper API below simplifies the precision conversion across the module.
-
-# Calculation dtype is set to "float16" and accumulation dtype is set to "float32"
-# in the configuration section above.
-
-from tvm.driver.tvmc.transform import apply_graph_transforms
-
-mod = apply_graph_transforms(
-    mod,
-    {
-        "mixed_precision": True,
-        "mixed_precision_ops": ["nn.conv2d", "nn.dense"],
-        "mixed_precision_calculation_type": calculation_dtype,
-        "mixed_precision_acc_type": acc_dtype,
-    },
-)
-
-#################################################################
-# The IR now contains cast operations, which are needed to convert
-# to FP16 precision.
-# You can also use "float16" or "float32" precisions as other dtype options.
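-
-# (Illustrative aside.) Assuming the conversion above succeeded, a quick sanity
-# check is to count the cast operations that now appear in the module's text form:
-
-print("cast ops after mixed precision conversion:", str(mod).count("cast("))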
-
-#################################################################
-# Prepare TVM Target
-# ------------------
-
-# This generated example runs on our x86 server for demonstration.
-
-# To deploy and run on a real target over RPC, please set :code:`local_demo` to False in the configuration section above.
-# Also, :code:`test_target` is set to :code:`llvm` in this example to keep it compatible with the x86 demonstration.
-# Please change it to :code:`opencl` or :code:`opencl -device=adreno` for an RPC target in the configuration above.
-
-if local_demo:
-    target = tvm.target.Target("llvm")
-elif "opencl" in test_target:
-    target = tvm.target.Target(test_target, host=target)
-
-##################################################################
-# AutoTuning
-# ----------
-# The few instructions below auto-tune the relay module, with XGBoost as the tuner algorithm.
-
-# The auto-tuning process involves extracting the tasks, defining a tuning configuration, and
-# tuning each task for the best-performing kernel configuration.
-
-# Get RPC related settings.
-rpc_tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
-rpc_tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190))
-key = "android"
-
-# Auto-tuning is a compute-intensive and time-consuming task.
-# It is set to False in the configuration above because this script runs on x86 for demonstration.
-# Please set :code:`is_tuning` to True to enable auto-tuning.
-
-if is_tuning:
-    # Auto Tuning Stage 1: Extract tunable tasks
-    tasks = autotvm.task.extract_from_program(
-        mod, target=test_target, target_host=target, params=params
-    )
-
-    # Auto Tuning Stage 2: Define tuning configuration
-    tmp_log_file = tune_log + ".tmp"
-    measure_option = autotvm.measure_option(
-        builder=autotvm.LocalBuilder(
-            build_func=ndk.create_shared, timeout=15
-        ),  # Build the test kernel locally
-        runner=autotvm.RPCRunner(  # The runner would be on a remote device.
-            key,  # RPC Key
-            host=rpc_tracker_host,  # Tracker host
-            port=int(rpc_tracker_port),  # Tracker port
-            number=3,  # Number of runs before averaging
-            timeout=600,  # RPC Timeout
-        ),
-    )
-    n_trial = 1024  # Number of iterations of training before choosing the best kernel config
-    early_stopping = False  # Can be enabled to stop tuning when the loss stops improving.
-
-    # Auto Tuning Stage 3: Iterate through the tasks and tune.
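-    # In addition to XGBTuner (imported just below), the tuner selection in the
-    # next block can also reference GATuner, RandomTuner, and GridSearchTuner;
-    # importing them up front keeps every branch usable:
-    from tvm.autotvm.tuner import GATuner, GridSearchTuner, RandomTuner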
- from tvm.autotvm.tuner import XGBTuner - - for i, tsk in enumerate(reversed(tasks[:3])): - print("Task:", tsk) - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - - # choose tuner - tuner = "xgb" - - # create tuner - if tuner == "xgb": - tuner_obj = XGBTuner(tsk, loss_type="reg") - elif tuner == "xgb_knob": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob") - elif tuner == "xgb_itervar": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar") - elif tuner == "xgb_curve": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve") - elif tuner == "xgb_rank": - tuner_obj = XGBTuner(tsk, loss_type="rank") - elif tuner == "xgb_rank_knob": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob") - elif tuner == "xgb_rank_itervar": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar") - elif tuner == "xgb_rank_curve": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve") - elif tuner == "xgb_rank_binary": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary") - elif tuner == "xgb_rank_binary_knob": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob") - elif tuner == "xgb_rank_binary_itervar": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar") - elif tuner == "xgb_rank_binary_curve": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve") - elif tuner == "ga": - tuner_obj = GATuner(tsk, pop_size=50) - elif tuner == "random": - tuner_obj = RandomTuner(tsk) - elif tuner == "gridsearch": - tuner_obj = GridSearchTuner(tsk) - else: - raise ValueError("Invalid tuner: " + tuner) - - tsk_trial = min(n_trial, len(tsk.config_space)) - tuner_obj.tune( - n_trial=tsk_trial, - early_stopping=early_stopping, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(tsk_trial, prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - # Auto Tuning Stage 4: Pick the best performing configurations from the overall log. - autotvm.record.pick_best(tmp_log_file, tune_log) - -################################################################# -# Enable OpenCLML Offloading -# -------------------------- -# OpenCLML offloading will try to accelerate supported operators -# by using OpenCLML proprietory operator library. - -# By default :code:`enable_clml` is set to False in above configuration section. - -if not local_demo and enable_clml: - mod = clml.partition_for_clml(mod, params) - -################################################################# -# Compilation -# ----------- -# Use tuning cache if exists. 
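-
-# (Illustrative aside.) An autotvm tuning log is a plain-text file with one JSON
-# record per line, so a rough check of how much tuning data is available before
-# applying it can be as simple as:
-
-if os.path.exists(tune_log):
-    with open(tune_log) as log_file:
-        print("tuning records available:", sum(1 for _ in log_file))
-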
-if os.path.exists(tune_log): - with autotvm.apply_history_best(tune_log): - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) -else: - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - -################################################################# -# Deploy the Model Remotely by RPC -# -------------------------------- -# Using RPC you can deploy the model from host -# machine to the remote Adreno device -if local_demo: - remote = rpc.LocalSession() -else: - tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port) - # When running a heavy model, we should increase the `session_timeout` - remote = tracker.request(key, priority=0, session_timeout=60) - -if local_demo: - dev = remote.cpu(0) -elif test_target.find("opencl"): - dev = remote.cl(0) -else: - dev = remote.cpu(0) - -temp = utils.tempdir() -dso_binary = "dev_lib_cl.so" -dso_binary_path = temp.relpath(dso_binary) -fcompile = ndk.create_shared if not local_demo else None -lib.export_library(dso_binary_path, fcompile=fcompile) -remote_path = "/data/local/tmp/" + dso_binary -remote.upload(dso_binary_path) -rlib = remote.load_module(dso_binary) -m = graph_executor.GraphModule(rlib["default"](dev)) - -################################################################# -# Run inference -# ------------- -# We now can set inputs, infer our model and get predictions as output -m.set_input(input_name, tvm.nd.array(img.astype("float32"))) -m.run() -tvm_output = m.get_output(0) - -################################################################# -# Get predictions and performance statistic -# ----------------------------------------- -# This piece of code displays the top-1 and top-5 predictions, as -# well as provides information about the model's performance -from os.path import join, isfile -from matplotlib import pyplot as plt -from tvm.contrib import download - -# Download ImageNet categories -categ_url = "https://github.com/uwsampl/web-data/raw/main/vta/models/" -categ_fn = "synset.txt" -download.download(join(categ_url, categ_fn), categ_fn) -synset = eval(open(categ_fn).read()) - -top_categories = np.argsort(tvm_output.asnumpy()[0]) -top5 = np.flip(top_categories, axis=0)[:5] - -# Report top-1 classification result -print("Top-1 id: {}, class name: {}".format(top5[1 - 1], synset[top5[1 - 1]])) - -# Report top-5 classification results -print("\nTop5 predictions: \n") -print("\t#1:", synset[top5[1 - 1]]) -print("\t#2:", synset[top5[2 - 1]]) -print("\t#3:", synset[top5[3 - 1]]) -print("\t#4:", synset[top5[4 - 1]]) -print("\t#5:", synset[top5[5 - 1]]) -print("\t", top5) -ImageNetClassifier = False -for k in top_categories[-5:]: - if "cat" in synset[k]: - ImageNetClassifier = True -assert ImageNetClassifier, "Failed ImageNet classifier validation check" - -print("Evaluate inference time cost...") -print(m.benchmark(dev, number=1, repeat=10)) diff --git a/gallery/how_to/deploy_models/deploy_model_on_adreno_tvmc.py b/gallery/how_to/deploy_models/deploy_model_on_adreno_tvmc.py deleted file mode 100644 index 0e037e9f912f..000000000000 --- a/gallery/how_to/deploy_models/deploy_model_on_adreno_tvmc.py +++ /dev/null @@ -1,212 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -.. _tutorial-deploy-model-on-adreno-tvmc: - -Deploy the Pretrained Model on Adreno™ with tvmc Interface -========================================================== -**Author**: Siva Rama Krishna - -This article is a step-by-step tutorial to deploy pretrained PyTorch resnet50 model on Adreno™. - -Besides that, you should have TVM built for Android. -See the following instructions on how to build it and setup RPC environment. - -`Deploy to Adreno GPU `_ - -""" - -import os -import tvm -import numpy as np -from tvm import relay -from tvm.driver import tvmc -from tvm.driver.tvmc.model import TVMCPackage -from tvm.contrib import utils - -################################################################# -# Configuration -# ------------- -# Specify Adreno target before compiling to generate texture -# leveraging kernels and get all the benefits of textures -# Note: This generated example running on our x86 server for demonstration. -# If running it on the Android device, we need to -# specify its instruction set. Set :code:`local_demo` to False if you want -# to run this tutorial with a real device over rpc. -local_demo = True - -# by default on CPU target will execute. -# select 'llvm', 'opencl' and 'opencl -device=adreno' -target = "llvm" - -# Change target configuration. -# Run `adb shell cat /proc/cpuinfo` to find the arch. -arch = "arm64" -target_host = "llvm -mtriple=%s-linux-android" % arch - -# Auto tuning is compute and time taking task, hence disabling for default run. Please enable it if required. -is_tuning = False -tune_log = "adreno-resnet50.log" - -# To enable OpenCLML accelerated operator library. -enable_clml = False -cross_compiler = ( - os.getenv("ANDROID_NDK_HOME", "") - + "/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android28-clang" -) - -####################################################################### -# Make a PyTorch Resnet50 Model -# ----------------------------- - -import torch -import torchvision.models as models - -# Load the ResNet50 model pre-trained on ImageNet -model = models.resnet50(pretrained=True) - -# Set the model to evaluation mode -model.eval() - -# Define the input shape -dummy_input = torch.randn(1, 3, 224, 224) - -# Trace the model -traced_model = torch.jit.trace(model, dummy_input) - -# Save the traced model -model_file_name = "resnet50_traced.pt" -traced_model.save(model_file_name) - - -####################################################################### -# Load Model -# ---------- -# Convert a model from any framework to a tvm relay module. -# tvmc.load supports models from any framework (like tensorflow saves_model, onnx, tflite ..etc) and auto detects the filetype. 
- -input_shape = (1, 3, 224, 224) # Batch size, channels, height, width - -# Load the TorchScript model with TVMC -tvmc_model = tvmc.load(model_file_name, shape_dict={"input": input_shape}, model_format="pytorch") - -print(tvmc_model.mod) - -# tvmc_model consists of tvmc_mode.mod which is relay module and tvmc_model.params which parms of the module. - -####################################################################### -# AutoTuning -# ---------- -# Now, the below api can be used for autotuning the model for any target. -# Tuning required RPC setup and please refer to -# `Deploy to Adreno GPU `_ - -rpc_tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1") -rpc_tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190)) -rpc_key = "android" -rpc_tracker = rpc_tracker_host + ":" + str(rpc_tracker_port) - -# Auto tuning is compute intensive and time taking task. -# It is set to False in above configuration as this script runs in x86 for demonstration. -# Please to set :code:`is_tuning` to True to enable auto tuning. - -# Also, :code:`test_target` is set to :code:`llvm` as this example to make compatible for x86 demonstration. -# Please change it to :code:`opencl` or :code:`opencl -device=adreno` for RPC target in configuration above. - -if is_tuning: - tvmc.tune( - tvmc_model, - target=target, - tuning_records=tune_log, - target_host=target_host, - hostname=rpc_tracker_host, - port=rpc_tracker_port, - rpc_key=rpc_key, - tuner="xgb", - repeat=30, - trials=3, - early_stopping=0, - ) - -####################################################################### -# Compilation -# ----------- -# Compilation to produce tvm artifacts - -# This generated example running on our x86 server for demonstration. -# To deply and tun on real target over RPC please set :code:`local_demo` to False in above configuration sestion. - -# OpenCLML offloading will try to accelerate supported operators by using OpenCLML proprietory operator library. -# By default :code:`enable_clml` is set to False in above configuration section. - -if not enable_clml: - if local_demo: - tvmc_package = tvmc.compile( - tvmc_model, - target=target, - ) - else: - tvmc_package = tvmc.compile( - tvmc_model, - target=target, - target_host=target_host, - cross=cross_compiler, - tuning_records=tune_log, - ) -else: - # Altrernatively, we can save the compilation output and save it as a TVMCPackage. - # This way avoids loading of compiled module without compiling again. - target = target + ", clml" - pkg_path = tmp_path.relpath("torch-resnet50.tar") - tvmc.compile( - tvmc_model, - target=target, - target_host=target_host, - cross=cross_compiler, - tuning_records=tune_log, - package_path=pkg_path, - ) - - # Load the compiled package - tvmc_package = TVMCPackage(package_path=pkg_path) - -# tvmc_package consists of tvmc_package.lib_path, tvmc_package.graph, tvmc_package.params -# Saved TVMPackage is nothing but tar archive with mod.so, mod.json and mod.params. - - -####################################################################### -# Deploy & Run -# ------------ -# Deploy and run the compiled model on RPC -# Let tvmc fill inputs using random - -# Run on RPC setup -if local_demo: - result = tvmc.run(tvmc_package, device="cpu", fill_mode="random") -else: - result = tvmc.run( - tvmc_package, - device="cl", - rpc_key=rpc_key, - hostname=rpc_tracker_host, - port=rpc_tracker_port, - fill_mode="random", - ) - -# result is a dictionary of outputs. 
-print("Result:", result) diff --git a/gallery/how_to/deploy_models/deploy_model_on_android.py b/gallery/how_to/deploy_models/deploy_model_on_android.py deleted file mode 100644 index 6ed69e64d49e..000000000000 --- a/gallery/how_to/deploy_models/deploy_model_on_android.py +++ /dev/null @@ -1,360 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -.. _tutorial-deploy-model-on-android: - -Deploy the Pretrained Model on Android -======================================= -**Author**: `Tomohiro Kato `_ - -This is an example of using Relay to compile a keras model and deploy it on Android device. -""" - - -import os -import numpy as np -from PIL import Image -import keras -from keras.applications.mobilenet_v2 import MobileNetV2 -import tvm -from tvm import te -import tvm.relay as relay -from tvm import rpc -from tvm.contrib import utils, ndk, graph_executor as runtime -from tvm.contrib.download import download_testdata - - -###################################################################### -# Setup Environment -# ----------------- -# Since there are many required packages for Android, it is recommended to use the official Docker Image. -# -# First, to build and run Docker Image, we can run the following command. -# -# .. code-block:: bash -# -# git clone --recursive https://github.com/apache/tvm tvm -# cd tvm -# docker build -t tvm.demo_android -f docker/Dockerfile.demo_android ./docker -# docker run --pid=host -h tvm -v $PWD:/workspace \ -# -w /workspace -p 9190:9190 --name tvm -it tvm.demo_android bash -# -# You are now inside the container. The cloned TVM directory is mounted on /workspace. -# At this time, mount the 9190 port used by RPC described later. -# -# .. note:: -# -# Please execute the following steps in the container. -# We can execute :code:`docker exec -it tvm bash` to open a new terminal in the container. -# -# Next we build the TVM. -# -# .. code-block:: bash -# -# mkdir build -# cd build -# cmake -DUSE_LLVM=llvm-config-8 \ -# -DUSE_RPC=ON \ -# -DUSE_SORT=ON \ -# -DUSE_VULKAN=ON \ -# -DUSE_GRAPH_EXECUTOR=ON \ -# .. -# make -j10 -# -# After building TVM successfully, Please set PYTHONPATH. -# -# .. code-block:: bash -# -# echo 'export PYTHONPATH=/workspace/python:/workspace/vta/python:${PYTHONPATH}' >> ~/.bashrc -# source ~/.bashrc - -################################################################# -# Start RPC Tracker -# ----------------- -# TVM uses RPC session to communicate with Android device. -# -# To start an RPC tracker, run this command in the container. The tracker is -# required during the whole tuning process, so we need to open a new terminal for -# this command: -# -# .. code-block:: bash -# -# python3 -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 -# -# The expected output is -# -# .. 
code-block:: bash -# -# INFO:RPCTracker:bind to 0.0.0.0:9190 - -################################################################# -# Register Android device to RPC Tracker -# -------------------------------------- -# Now we can register our Android device to the tracker. -# -# Follow this `readme page `_ to -# install TVM RPC APK on the android device. -# -# Here is an example of config.mk. I enabled OpenCL and Vulkan. -# -# -# .. code-block:: bash -# -# APP_ABI = arm64-v8a -# -# APP_PLATFORM = android-24 -# -# # whether enable OpenCL during compile -# USE_OPENCL = 1 -# -# # whether to enable Vulkan during compile -# USE_VULKAN = 1 -# -# ifeq ($(USE_VULKAN), 1) -# # Statically linking vulkan requires API Level 24 or higher -# APP_PLATFORM = android-24 -# endif -# -# # the additional include headers you want to add, e.g., SDK_PATH/adrenosdk/Development/Inc -# ADD_C_INCLUDES += /work/adrenosdk-linux-5_0/Development/Inc -# ADD_C_INCLUDES = -# -# # the additional link libs you want to add, e.g., ANDROID_LIB_PATH/libOpenCL.so -# ADD_LDLIBS = -# -# .. note:: -# -# At this time, don't forget to `create a standalone toolchain `_ . -# -# for example -# -# .. code-block:: bash -# -# $ANDROID_NDK_HOME/build/tools/make-standalone-toolchain.sh \ -# --platform=android-24 --use-llvm --arch=arm64 --install-dir=/opt/android-toolchain-arm64 -# export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++ -# -# Next, start the Android application and enter the IP address and port of RPC Tracker. -# Then you have already registered your device. -# -# After registering devices, we can confirm it by querying rpc_tracker -# -# .. code-block:: bash -# -# python3 -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190 -# -# For example, if we have 1 Android device. -# the output can be -# -# .. code-block:: bash -# -# Queue Status -# ---------------------------------- -# key total free pending -# ---------------------------------- -# android 1 1 0 -# ---------------------------------- -# -# To confirm that you can communicate with Android, we can run following test script. -# If you use OpenCL and Vulkan, please set :code:`test_opencl` and :code:`test_vulkan` in the script. -# -# .. code-block:: bash -# -# export TVM_TRACKER_HOST=0.0.0.0 -# export TVM_TRACKER_PORT=9190 -# -# .. code-block:: bash -# -# cd /workspace/apps/android_rpc -# python3 tests/android_rpc_test.py -# - -###################################################################### -# Load pretrained keras model -# --------------------------- -# We load a pretrained MobileNetV2(alpha=0.5) classification model provided by keras. -keras.backend.clear_session() # Destroys the current TF graph and creates a new one. -weights_url = "".join( - [ - "https://github.com/JonathanCMitchell/", - "mobilenet_v2_keras/releases/download/v1.1/", - "mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.5_224.h5", - ] -) -weights_file = "mobilenet_v2_weights.h5" -weights_path = download_testdata(weights_url, weights_file, module="keras") -keras_mobilenet_v2 = MobileNetV2( - alpha=0.5, include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000 -) -keras_mobilenet_v2.load_weights(weights_path) - -###################################################################### -# In order to test our model, here we download an image of cat and -# transform its format. 
-img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" -img_name = "cat.png" -img_path = download_testdata(img_url, img_name, module="data") -image = Image.open(img_path).resize((224, 224)) -dtype = "float32" - - -def transform_image(image): - image = np.array(image) - np.array([123.0, 117.0, 104.0]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - return image - - -x = transform_image(image) - -###################################################################### -# synset is used to transform the label from number of ImageNet class to -# the word human can understand. -synset_url = "".join( - [ - "https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt", - ] -) -synset_name = "imagenet1000_clsid_to_human.txt" -synset_path = download_testdata(synset_url, synset_name, module="data") -with open(synset_path) as f: - synset = eval(f.read()) - - -###################################################################### -# Compile the model with relay -# ---------------------------- -# If we run the example on our x86 server for demonstration, we can simply -# set it as :code:`llvm`. If running it on the Android device, we need to -# specify its instruction set. Set :code:`local_demo` to False if you want -# to run this tutorial with a real device. - -local_demo = True - -# by default on CPU target will execute. -# select 'cpu', 'opencl' and 'vulkan' -test_target = "cpu" - -# Change target configuration. -# Run `adb shell cat /proc/cpuinfo` to find the arch. -arch = "arm64" -target = tvm.target.Target("llvm -mtriple=%s-linux-android" % arch) - -if local_demo: - target = tvm.target.Target("llvm") -elif test_target == "opencl": - target = tvm.target.Target("opencl", host=target) -elif test_target == "vulkan": - target = tvm.target.Target("vulkan", host=target) - -input_name = "input_1" -shape_dict = {input_name: x.shape} -mod, params = relay.frontend.from_keras(keras_mobilenet_v2, shape_dict) - -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - -# After `relay.build`, you will get three return values: graph, -# library and the new parameter, since we do some optimization that will -# change the parameters but keep the result of model as the same. - -# Save the library at local temporary directory. -tmp = utils.tempdir() -lib_fname = tmp.relpath("net.so") -fcompile = ndk.create_shared if not local_demo else None -lib.export_library(lib_fname, fcompile=fcompile) - -###################################################################### -# Deploy the Model Remotely by RPC -# -------------------------------- -# With RPC, you can deploy the model remotely from your host machine -# to the remote android device. 
- -tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1") -tracker_port = int(os.environ.get("TVM_TRACKER_PORT", 9190)) -key = "android" - -if local_demo: - remote = rpc.LocalSession() -else: - tracker = rpc.connect_tracker(tracker_host, tracker_port) - # When running a heavy model, we should increase the `session_timeout` - remote = tracker.request(key, priority=0, session_timeout=60) - -if local_demo: - dev = remote.cpu(0) -elif test_target == "opencl": - dev = remote.cl(0) -elif test_target == "vulkan": - dev = remote.vulkan(0) -else: - dev = remote.cpu(0) - -# upload the library to remote device and load it -remote.upload(lib_fname) -rlib = remote.load_module("net.so") - -# create the remote runtime module -module = runtime.GraphModule(rlib["default"](dev)) - -###################################################################### -# Execute on TVM -# -------------- - -# set input data -module.set_input(input_name, tvm.nd.array(x.astype(dtype))) -# run -module.run() -# get output -out = module.get_output(0) - -# get top1 result -top1 = np.argmax(out.numpy()) -print("TVM prediction top-1: {}".format(synset[top1])) - -print("Evaluate inference time cost...") -print(module.benchmark(dev, number=1, repeat=10)) - -###################################################################### -# Sample Output -# ------------- -# The following is the result of 'cpu', 'opencl' and 'vulkan' using Adreno 530 on Snapdragon 820 -# -# Although we can run on a GPU, it is slower than CPU. -# To speed up, we need to write and optimize the schedule according to the GPU architecture. -# -# .. code-block:: bash -# -# # cpu -# TVM prediction top-1: tiger cat -# Evaluate inference time cost... -# Mean inference time (std dev): 37.92 ms (19.67 ms) -# -# # opencl -# TVM prediction top-1: tiger cat -# Evaluate inference time cost... -# Mean inference time (std dev): 419.83 ms (7.49 ms) -# -# # vulkan -# TVM prediction top-1: tiger cat -# Evaluate inference time cost... -# Mean inference time (std dev): 465.80 ms (4.52 ms) diff --git a/gallery/how_to/deploy_models/deploy_model_on_nano.py b/gallery/how_to/deploy_models/deploy_model_on_nano.py deleted file mode 100644 index a65615954697..000000000000 --- a/gallery/how_to/deploy_models/deploy_model_on_nano.py +++ /dev/null @@ -1,251 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-deploy-model-on-nano: - -Deploy the Pretrained Model on Jetson Nano -=========================================== -**Author**: `BBuf `_ - -This is an example of using Relay to compile a ResNet model and deploy -it on Jetson Nano. 
-""" - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import tvm -from tvm import te -import tvm.relay as relay -from tvm import rpc -from tvm.contrib import utils, graph_executor as runtime -from tvm.contrib.download import download_testdata - -###################################################################### -# .. _build-tvm-runtime-on-jetson-nano: -# -# Build TVM Runtime on Jetson Nano -# -------------------------------- -# -# The first step is to build the TVM runtime on the remote device. -# -# .. note:: -# -# All instructions in both this section and next section should be -# executed on the target device, e.g. Jetson Nano. And we assume it -# has Linux running. -# -# Since we do compilation on local machine, the remote device is only used -# for running the generated code. We only need to build tvm runtime on -# the remote device. -# -# .. code-block:: bash -# -# git clone --recursive https://github.com/apache/tvm tvm -# cd tvm -# mkdir build -# cp cmake/config.cmake build -# cd build -# cmake .. -# make runtime -j4 -# .. note:: -# -# If we want to use Jetson Nano's GPU for inference, -# we need to enable the CUDA option in `config.cmake`, -# that is, `set(USE_CUDA ON)` -# -# After building runtime successfully, we need to set environment varibles -# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc` -# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM -# directory is in :code:`~/tvm`): -# -# .. code-block:: bash -# -# export PYTHONPATH=$PYTHONPATH:~/tvm/python -# -# To update the environment variables, execute :code:`source ~/.bashrc`. - -###################################################################### -# Set Up RPC Server on Device -# --------------------------- -# To start an RPC server, run the following command on your remote device -# (Which is Jetson Nano in our example). -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9091 -# -# If you see the line below, it means the RPC server started -# successfully on your device. -# -# .. code-block:: bash -# -# INFO:RPCServer:bind to 0.0.0.0:9091 -# - -###################################################################### -# Prepare the Pre-trained Model -# ----------------------------- -# Back to the host machine, which should have a full TVM installed (with LLVM). -# -# We will use pre-trained model from torchvision - -import torch -import torchvision -from PIL import Image -import numpy as np - -# one line to get the model -model_name = "resnet18" -model = getattr(torchvision.models, model_name)(pretrained=True) -model = model.eval() - -# We grab the TorchScripted model via tracing -input_shape = [1, 3, 224, 224] -input_data = torch.randn(input_shape) -scripted_model = torch.jit.trace(model, input_data).eval() - -###################################################################### -# In order to test our model, here we download an image of cat and -# transform its format. 
-img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" -img_name = "cat.png" -img_path = download_testdata(img_url, img_name, module="data") -image = Image.open(img_path).resize((224, 224)) - - -def transform_image(image): - image = np.array(image) - np.array([123.0, 117.0, 104.0]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - return image - - -x = transform_image(image) - -###################################################################### -# synset is used to transform the label from number of ImageNet class to -# the word human can understand. -synset_url = "".join( - [ - "https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt", - ] -) -synset_name = "imagenet1000_clsid_to_human.txt" -synset_path = download_testdata(synset_url, synset_name, module="data") -with open(synset_path) as f: - synset = eval(f.read()) - -###################################################################### -# Now we would like to port the Gluon model to a portable computational graph. -# It's as easy as several lines. - -input_name = "input0" -shape_list = [(input_name, x.shape)] -mod, params = relay.frontend.from_pytorch(scripted_model, shape_list) -# we want a probability so add a softmax operator -func = mod["main"] -func = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs) - -###################################################################### -# Here are some basic data workload configurations. -batch_size = 1 -num_classes = 1000 -image_shape = (3, 224, 224) -data_shape = (batch_size,) + image_shape - -###################################################################### -# Compile The Graph -# ----------------- -# To compile the graph, we call the :py:func:`relay.build` function -# with the graph configuration and parameters. However, You cannot to -# deploy a x86 program on a device with ARM instruction set. It means -# Relay also needs to know the compilation option of target device, -# apart from arguments :code:`net` and :code:`params` to specify the -# deep learning workload. Actually, the option matters, different option -# will lead to very different performance. - -###################################################################### -# If we run the example on our x86 server for demonstration, we can simply -# set it as :code:`llvm`. If running it on the Jetson Nano, we need to -# set it as :code:`nvidia/jetson-nano`. Set :code:`local_demo` to False -# if you want to run this tutorial with a real device. - -local_demo = True - -if local_demo: - target = tvm.target.Target("llvm") -else: - target = tvm.target.Target("nvidia/jetson-nano") - assert target.kind.name == "cuda" - assert target.attrs["arch"] == "sm_53" - assert target.attrs["shared_memory_per_block"] == 49152 - assert target.attrs["max_threads_per_block"] == 1024 - assert target.attrs["thread_warp_size"] == 32 - assert target.attrs["registers_per_block"] == 32768 - -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(func, target, params=params) - -# After `relay.build`, you will get three return values: graph, -# library and the new parameter, since we do some optimization that will -# change the parameters but keep the result of model as the same. - -# Save the library at local temporary directory. 
-tmp = utils.tempdir() -lib_fname = tmp.relpath("net.tar") -lib.export_library(lib_fname) - -###################################################################### -# Deploy the Model Remotely by RPC -# -------------------------------- -# With RPC, you can deploy the model remotely from your host machine -# to the remote device. - -# obtain an RPC session from remote device. -if local_demo: - remote = rpc.LocalSession() -else: - # The following is my environment, change this to the IP address of your target device - host = "192.168.1.11" - port = 9091 - remote = rpc.connect(host, port) - -# upload the library to remote device and load it -remote.upload(lib_fname) -rlib = remote.load_module("net.tar") - -# create the remote runtime module -if local_demo: - dev = remote.cpu(0) -else: - dev = remote.cuda(0) - -module = runtime.GraphModule(rlib["default"](dev)) -# set input data -module.set_input(input_name, tvm.nd.array(x.astype("float32"))) -# run -module.run() -# get output -out = module.get_output(0) -# get top1 result -top1 = np.argmax(out.numpy()) -print("TVM prediction top-1: {}".format(synset[top1])) diff --git a/gallery/how_to/deploy_models/deploy_model_on_rasp.py b/gallery/how_to/deploy_models/deploy_model_on_rasp.py deleted file mode 100644 index 64f83dbbc0f8..000000000000 --- a/gallery/how_to/deploy_models/deploy_model_on_rasp.py +++ /dev/null @@ -1,236 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-deploy-model-on-rasp: - -Deploy the Pretrained Model on Raspberry Pi -=========================================== -**Author**: `Ziheng Jiang `_, \ - `Hiroyuki Makino `_ - -This is an example of using Relay to compile a ResNet model and deploy -it on Raspberry Pi. -""" - -import tvm -from tvm import te -import tvm.relay as relay -from tvm import rpc -from tvm.contrib import utils, graph_executor as runtime -from tvm.contrib.download import download_testdata - -###################################################################### -# .. _build-tvm-runtime-on-device: -# -# Build TVM Runtime on Device -# --------------------------- -# -# The first step is to build the TVM runtime on the remote device. -# -# .. note:: -# -# All instructions in both this section and next section should be -# executed on the target device, e.g. Raspberry Pi. And we assume it -# has Linux running. -# -# Since we do compilation on local machine, the remote device is only used -# for running the generated code. We only need to build tvm runtime on -# the remote device. -# -# .. code-block:: bash -# -# git clone --recursive https://github.com/apache/tvm tvm -# cd tvm -# mkdir build -# cp cmake/config.cmake build -# cd build -# cmake .. 
-# make runtime -j4 -# -# After building runtime successfully, we need to set environment varibles -# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc` -# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM -# directory is in :code:`~/tvm`): -# -# .. code-block:: bash -# -# export PYTHONPATH=$PYTHONPATH:~/tvm/python -# -# To update the environment variables, execute :code:`source ~/.bashrc`. - -###################################################################### -# Set Up RPC Server on Device -# --------------------------- -# To start an RPC server, run the following command on your remote device -# (Which is Raspberry Pi in our example). -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090 -# -# If you see the line below, it means the RPC server started -# successfully on your device. -# -# .. code-block:: bash -# -# INFO:root:RPCServer: bind to 0.0.0.0:9090 -# - -###################################################################### -# Prepare the Pre-trained Model -# ----------------------------- -# Back to the host machine, which should have a full TVM installed (with LLVM). -# -# We will use pre-trained model from torchvision - -import torch -import torchvision -from PIL import Image -import numpy as np - -# one line to get the model -model_name = "resnet18" -model = getattr(torchvision.models, model_name)(pretrained=True) -model = model.eval() - -# We grab the TorchScripted model via tracing -input_shape = [1, 3, 224, 224] -input_data = torch.randn(input_shape) -scripted_model = torch.jit.trace(model, input_data).eval() - -###################################################################### -# In order to test our model, here we download an image of cat and -# transform its format. -img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" -img_name = "cat.png" -img_path = download_testdata(img_url, img_name, module="data") -image = Image.open(img_path).resize((224, 224)) - - -def transform_image(image): - image = np.array(image) - np.array([123.0, 117.0, 104.0]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - return image - - -x = transform_image(image) - -###################################################################### -# synset is used to transform the label from number of ImageNet class to -# the word human can understand. -synset_url = "".join( - [ - "https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt", - ] -) -synset_name = "imagenet1000_clsid_to_human.txt" -synset_path = download_testdata(synset_url, synset_name, module="data") -with open(synset_path) as f: - synset = eval(f.read()) - -###################################################################### -# Now we would like to port the PyTorch model to a portable computational graph. -# It's as easy as several lines. - -input_name = "input0" -shape_list = [(input_name, x.shape)] -mod, params = relay.frontend.from_pytorch(scripted_model, shape_list) -# we want a probability so add a softmax operator -func = mod["main"] -func = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs) - -###################################################################### -# Here are some basic data workload configurations. 
-batch_size = 1 -num_classes = 1000 -image_shape = (3, 224, 224) -data_shape = (batch_size,) + image_shape - -###################################################################### -# Compile The Graph -# ----------------- -# To compile the graph, we call the :py:func:`relay.build` function -# with the graph configuration and parameters. However, You cannot to -# deploy a x86 program on a device with ARM instruction set. It means -# Relay also needs to know the compilation option of target device, -# apart from arguments :code:`net` and :code:`params` to specify the -# deep learning workload. Actually, the option matters, different option -# will lead to very different performance. - -###################################################################### -# If we run the example on our x86 server for demonstration, we can simply -# set it as :code:`llvm`. If running it on the Raspberry Pi, we need to -# specify its instruction set. Set :code:`local_demo` to False if you want -# to run this tutorial with a real device. - -local_demo = True - -if local_demo: - target = tvm.target.Target("llvm") -else: - target = tvm.target.arm_cpu("rasp3b") - # The above line is a simple form of - # target = tvm.target.Target('llvm -device=arm_cpu -model=bcm2837 -mtriple=armv7l-linux-gnueabihf -mattr=+neon') - -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(func, target, params=params) - -# After `relay.build`, you will get three return values: graph, -# library and the new parameter, since we do some optimization that will -# change the parameters but keep the result of model as the same. - -# Save the library at local temporary directory. -tmp = utils.tempdir() -lib_fname = tmp.relpath("net.tar") -lib.export_library(lib_fname) - -###################################################################### -# Deploy the Model Remotely by RPC -# -------------------------------- -# With RPC, you can deploy the model remotely from your host machine -# to the remote device. - -# obtain an RPC session from remote device. -if local_demo: - remote = rpc.LocalSession() -else: - # The following is my environment, change this to the IP address of your target device - host = "10.77.1.162" - port = 9090 - remote = rpc.connect(host, port) - -# upload the library to remote device and load it -remote.upload(lib_fname) -rlib = remote.load_module("net.tar") - -# create the remote runtime module -dev = remote.cpu(0) -module = runtime.GraphModule(rlib["default"](dev)) -# set input data -module.set_input(input_name, tvm.nd.array(x.astype("float32"))) -# run -module.run() -# get output -out = module.get_output(0) -# get top1 result -top1 = np.argmax(out.numpy()) -print("TVM prediction top-1: {}".format(synset[top1])) diff --git a/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py b/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py deleted file mode 100644 index 8400e82b4215..000000000000 --- a/gallery/how_to/deploy_models/deploy_object_detection_pytorch.py +++ /dev/null @@ -1,153 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile PyTorch Object Detection Models -======================================= -This article is an introductory tutorial to deploy PyTorch object -detection models with Relay VM. - -For us to begin with, PyTorch should be installed. -TorchVision is also required since we will be using it as our model zoo. - -A quick solution is to install via pip - -.. code-block:: bash - - pip install torch - pip install torchvision - -or please refer to official site -https://pytorch.org/get-started/locally/ - -PyTorch versions should be backwards compatible but should be used -with the proper TorchVision version. - -Currently, TVM supports PyTorch 1.7 and 1.4. Other versions may -be unstable. -""" - -import tvm -from tvm import relay -from tvm import relay -from tvm.runtime.vm import VirtualMachine -from tvm.contrib.download import download_testdata - -import numpy as np -import cv2 - -# PyTorch imports -import torch -import torchvision - -###################################################################### -# Load pre-trained maskrcnn from torchvision and do tracing -# --------------------------------------------------------- -in_size = 300 - -input_shape = (1, 3, in_size, in_size) - - -def do_trace(model, inp): - model_trace = torch.jit.trace(model, inp) - model_trace.eval() - return model_trace - - -def dict_to_tuple(out_dict): - if "masks" in out_dict.keys(): - return out_dict["boxes"], out_dict["scores"], out_dict["labels"], out_dict["masks"] - return out_dict["boxes"], out_dict["scores"], out_dict["labels"] - - -class TraceWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, inp): - out = self.model(inp) - return dict_to_tuple(out[0]) - - -model_func = torchvision.models.detection.maskrcnn_resnet50_fpn -model = TraceWrapper(model_func(pretrained=True)) - -model.eval() -inp = torch.Tensor(np.random.uniform(0.0, 250.0, size=(1, 3, in_size, in_size))) - -with torch.no_grad(): - out = model(inp) - script_module = do_trace(model, inp) - -###################################################################### -# Download a test image and pre-process -# ------------------------------------- -img_url = ( - "https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/detection/street_small.jpg" -) -img_path = download_testdata(img_url, "test_street_small.jpg", module="data") - -img = cv2.imread(img_path).astype("float32") -img = cv2.resize(img, (in_size, in_size)) -img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) -img = np.transpose(img / 255.0, [2, 0, 1]) -img = np.expand_dims(img, axis=0) - -###################################################################### -# Import the graph to Relay -# ------------------------- -input_name = "input0" -shape_list = [(input_name, input_shape)] -mod, params = relay.frontend.from_pytorch(script_module, shape_list) - -###################################################################### -# Compile with Relay VM -# --------------------- -# Note: Currently only CPU target is supported. 
For x86 target, it is -# highly recommended to build TVM with Intel MKL and Intel OpenMP to get -# best performance, due to the existence of large dense operator in -# torchvision rcnn models. - -# Add "-libs=mkl" to get best performance on x86 target. -# For x86 machine supports AVX512, the complete target is -# "llvm -mcpu=skylake-avx512 -libs=mkl" -target = "llvm" - -with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]): - vm_exec = relay.vm.compile(mod, target=target, params=params) - -###################################################################### -# Inference with Relay VM -# ----------------------- -dev = tvm.cpu() -vm = VirtualMachine(vm_exec, dev) -vm.set_input("main", **{input_name: img}) -tvm_res = vm.run() - -###################################################################### -# Get boxes with score larger than 0.9 -# ------------------------------------ -score_threshold = 0.9 -boxes = tvm_res[0].numpy().tolist() -valid_boxes = [] -for i, score in enumerate(tvm_res[1].numpy().tolist()): - if score > score_threshold: - valid_boxes.append(boxes[i]) - else: - break - -print("Get {} valid boxes".format(len(valid_boxes))) diff --git a/gallery/how_to/deploy_models/deploy_prequantized.py b/gallery/how_to/deploy_models/deploy_prequantized.py deleted file mode 100644 index c55e608baf9b..000000000000 --- a/gallery/how_to/deploy_models/deploy_prequantized.py +++ /dev/null @@ -1,242 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Deploy a Framework-prequantized Model with TVM -============================================== -**Author**: `Masahiro Masuda `_ - -This is a tutorial on loading models quantized by deep learning frameworks into TVM. -Pre-quantized model import is one of the quantization support we have in TVM. More details on -the quantization story in TVM can be found -`here `_. - -Here, we demonstrate how to load and run models quantized by PyTorch, MXNet, and TFLite. -Once loaded, we can run compiled, quantized models on any hardware TVM supports. 
-""" - - -################################################################################# -# First, necessary imports -from PIL import Image - -import numpy as np - -import torch -from torchvision.models.quantization import mobilenet as qmobilenet - -import tvm -from tvm import relay -from tvm.contrib.download import download_testdata - - -################################################################################# -# Helper functions to run the demo -def get_transform(): - import torchvision.transforms as transforms - - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - return transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ] - ) - - -def get_real_image(im_height, im_width): - img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true" - img_path = download_testdata(img_url, "cat.png", module="data") - return Image.open(img_path).resize((im_height, im_width)) - - -def get_imagenet_input(): - im = get_real_image(224, 224) - preprocess = get_transform() - pt_tensor = preprocess(im) - return np.expand_dims(pt_tensor.numpy(), 0) - - -def get_synset(): - synset_url = "".join( - [ - "https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt", - ] - ) - synset_name = "imagenet1000_clsid_to_human.txt" - synset_path = download_testdata(synset_url, synset_name, module="data") - with open(synset_path) as f: - return eval(f.read()) - - -def run_tvm_model(mod, params, input_name, inp, target="llvm"): - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - - runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](tvm.device(target, 0))) - - runtime.set_input(input_name, inp) - runtime.run() - return runtime.get_output(0).numpy(), runtime - - -################################################################################# -# A mapping from label to class name, to verify that the outputs from models below -# are reasonable -synset = get_synset() - -################################################################################# -# Everyone's favorite cat image for demonstration -inp = get_imagenet_input() - -################################################################################ -# Deploy a quantized PyTorch Model -# -------------------------------- -# First, we demonstrate how to load deep learning models quantized by PyTorch, -# using our PyTorch frontend. -# -# Please refer to the PyTorch static quantization tutorial below to learn about -# their quantization workflow. -# https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html -# -# We use this function to quantize PyTorch models. -# In short, this function takes a floating point model and converts it to uint8. -# The model is per-channel quantized. 
- - -def quantize_model(model, inp): - model.fuse_model() - model.qconfig = torch.quantization.get_default_qconfig("fbgemm") - torch.quantization.prepare(model, inplace=True) - # Dummy calibration - model(inp) - torch.quantization.convert(model, inplace=True) - - -############################################################################## -# Load quantization-ready, pretrained Mobilenet v2 model from torchvision -# ----------------------------------------------------------------------- -# We choose mobilenet v2 because this model was trained with quantization aware -# training. Other models require a full post training calibration. -qmodel = qmobilenet.mobilenet_v2(pretrained=True).eval() - -############################################################################## -# Quantize, trace and run the PyTorch Mobilenet v2 model -# ------------------------------------------------------ -# The details are out of scope for this tutorial. Please refer to the tutorials -# on the PyTorch website to learn about quantization and jit. -pt_inp = torch.from_numpy(inp) -quantize_model(qmodel, pt_inp) -script_module = torch.jit.trace(qmodel, pt_inp).eval() - -with torch.no_grad(): - pt_result = script_module(pt_inp).numpy() - -############################################################################## -# Convert quantized Mobilenet v2 to Relay-QNN using the PyTorch frontend -# ---------------------------------------------------------------------- -# The PyTorch frontend has support for converting a quantized PyTorch model to -# an equivalent Relay module enriched with quantization-aware operators. -# We call this representation Relay QNN dialect. -# -# You can print the output from the frontend to see how quantized models are -# represented. -# -# You would see operators specific to quantization such as -# qnn.quantize, qnn.dequantize, qnn.requantize, and qnn.conv2d etc. -input_name = "input" # the input name can be arbitrary for PyTorch frontend. -input_shapes = [(input_name, (1, 3, 224, 224))] -mod, params = relay.frontend.from_pytorch(script_module, input_shapes) -# print(mod) # comment in to see the QNN IR dump - -############################################################################## -# Compile and run the Relay module -# -------------------------------- -# Once we obtained the quantized Relay module, the rest of the workflow -# is the same as running floating point models. Please refer to other -# tutorials for more details. -# -# Under the hood, quantization specific operators are lowered to a sequence of -# standard Relay operators before compilation. -target = "llvm" -tvm_result, rt_mod = run_tvm_model(mod, params, input_name, inp, target=target) - -########################################################################## -# Compare the output labels -# ------------------------- -# We should see identical labels printed. -pt_top3_labels = np.argsort(pt_result[0])[::-1][:3] -tvm_top3_labels = np.argsort(tvm_result[0])[::-1][:3] - -print("PyTorch top3 labels:", [synset[label] for label in pt_top3_labels]) -print("TVM top3 labels:", [synset[label] for label in tvm_top3_labels]) - -########################################################################################### -# However, due to the difference in numerics, in general the raw floating point -# outputs are not expected to be identical. Here, we print how many floating point -# output values are identical out of 1000 outputs from mobilenet v2. -print("%d in 1000 raw floating outputs identical." 
% np.sum(tvm_result[0] == pt_result[0])) - -########################################################################## -# Measure performance -# ------------------------- -# Here we give an example of how to measure performance of TVM compiled models. -n_repeat = 100 # should be bigger to make the measurement more accurate -dev = tvm.cpu(0) -print(rt_mod.benchmark(dev, number=1, repeat=n_repeat)) - -###################################################################### -# .. note:: -# -# We recommend this method for the following reasons: -# -# * Measurements are done in C++, so there is no Python overhead -# * It includes several warm up runs -# * The same method can be used to profile on remote devices (android etc.). - - -###################################################################### -# .. note:: -# -# Unless the hardware has special support for fast 8 bit instructions, quantized models are -# not expected to be any faster than FP32 models. Without fast 8 bit instructions, TVM does -# quantized convolution in 16 bit, even if the model itself is 8 bit. -# -# For x86, the best performance can be achieved on CPUs with AVX512 instructions set. -# In this case, TVM utilizes the fastest available 8 bit instructions for the given target. -# This includes support for the VNNI 8 bit dot product instruction (CascadeLake or newer). -# -# Moreover, the following general tips for CPU performance equally applies: -# -# * Set the environment variable TVM_NUM_THREADS to the number of physical cores -# * Choose the best target for your hardware, such as "llvm -mcpu=skylake-avx512" or -# "llvm -mcpu=cascadelake" (more CPUs with AVX512 would come in the future) - - -############################################################################### -# Deploy a quantized MXNet Model -# ------------------------------ -# TODO - -############################################################################### -# Deploy a quantized TFLite Model -# ------------------------------- -# TODO diff --git a/gallery/how_to/deploy_models/deploy_prequantized_tflite.py b/gallery/how_to/deploy_models/deploy_prequantized_tflite.py deleted file mode 100644 index 2d0e225dce39..000000000000 --- a/gallery/how_to/deploy_models/deploy_prequantized_tflite.py +++ /dev/null @@ -1,263 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Deploy a Framework-prequantized Model with TVM - Part 3 (TFLite) -================================================================ -**Author**: `Siju Samuel `_ - -Welcome to part 3 of the Deploy Framework-Prequantized Model with TVM tutorial. -In this part, we will start with a Quantized TFLite graph and then compile and execute it via TVM. 
- - -For more details on quantizing the model using TFLite, readers are encouraged to -go through `Converting Quantized Models -`_. - -The TFLite models can be downloaded from this `link -`_. - -To get started, Tensorflow and TFLite package needs to be installed as prerequisite. - -.. code-block:: bash - - # install tensorflow and tflite - pip install tensorflow==2.1.0 - pip install tflite==2.1.0 - -Now please check if TFLite package is installed successfully, ``python -c "import tflite"`` - -""" - - -############################################################################### -# Necessary imports -# ----------------- -import os - -import numpy as np -import tflite - -import tvm -from tvm import relay - - -###################################################################### -# Download pretrained Quantized TFLite model -# ------------------------------------------ - -# Download mobilenet V2 TFLite model provided by Google -from tvm.contrib.download import download_testdata - -model_url = ( - "https://storage.googleapis.com/download.tensorflow.org/models/" - "tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz" -) - -# Download model tar file and extract it to get mobilenet_v2_1.0_224.tflite -model_path = download_testdata( - model_url, "mobilenet_v2_1.0_224_quant.tgz", module=["tf", "official"] -) -model_dir = os.path.dirname(model_path) - - -###################################################################### -# Utils for downloading and extracting zip files -# ---------------------------------------------- -def extract(path): - import tarfile - - if path.endswith("tgz") or path.endswith("gz"): - dir_path = os.path.dirname(path) - tar = tarfile.open(path) - tar.extractall(path=dir_path) - tar.close() - else: - raise RuntimeError("Could not decompress the file: " + path) - - -extract(model_path) - - -###################################################################### -# Load a test image -# ----------------- - -####################################################################### -# Get a real image for e2e testing -# -------------------------------- -def get_real_image(im_height, im_width): - from PIL import Image - - repo_base = "https://github.com/dmlc/web-data/raw/main/tensorflow/models/InceptionV1/" - img_name = "elephant-299.jpg" - image_url = os.path.join(repo_base, img_name) - img_path = download_testdata(image_url, img_name, module="data") - image = Image.open(img_path).resize((im_height, im_width)) - x = np.array(image).astype("uint8") - data = np.reshape(x, (1, im_height, im_width, 3)) - return data - - -data = get_real_image(224, 224) - -###################################################################### -# Load a tflite model -# ------------------- - -###################################################################### -# Now we can open mobilenet_v2_1.0_224.tflite -tflite_model_file = os.path.join(model_dir, "mobilenet_v2_1.0_224_quant.tflite") -tflite_model_buf = open(tflite_model_file, "rb").read() - -# Get TFLite model from buffer -try: - import tflite - - tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) -except AttributeError: - import tflite.Model - - tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) - -############################################################################### -# Lets run TFLite pre-quantized model inference and get the TFLite prediction. 
-def run_tflite_model(tflite_model_buf, input_data): - """Generic function to execute TFLite""" - try: - from tensorflow import lite as interpreter_wrapper - except ImportError: - from tensorflow.contrib import lite as interpreter_wrapper - - input_data = input_data if isinstance(input_data, list) else [input_data] - - interpreter = interpreter_wrapper.Interpreter(model_content=tflite_model_buf) - interpreter.allocate_tensors() - - input_details = interpreter.get_input_details() - output_details = interpreter.get_output_details() - - # set input - assert len(input_data) == len(input_details) - for i in range(len(input_details)): - interpreter.set_tensor(input_details[i]["index"], input_data[i]) - - # Run - interpreter.invoke() - - # get output - tflite_output = list() - for i in range(len(output_details)): - tflite_output.append(interpreter.get_tensor(output_details[i]["index"])) - - return tflite_output - - -############################################################################### -# Lets run TVM compiled pre-quantized model inference and get the TVM prediction. -def run_tvm(lib): - from tvm.contrib import graph_executor - - rt_mod = graph_executor.GraphModule(lib["default"](tvm.cpu(0))) - rt_mod.set_input("input", data) - rt_mod.run() - tvm_res = rt_mod.get_output(0).numpy() - tvm_pred = np.squeeze(tvm_res).argsort()[-5:][::-1] - return tvm_pred, rt_mod - - -############################################################################### -# TFLite inference -# ---------------- - -############################################################################### -# Run TFLite inference on the quantized model. -tflite_res = run_tflite_model(tflite_model_buf, data) -tflite_pred = np.squeeze(tflite_res).argsort()[-5:][::-1] - -############################################################################### -# TVM compilation and inference -# ----------------------------- - -############################################################################### -# We use the TFLite-Relay parser to convert the TFLite pre-quantized graph into Relay IR. Note that -# frontend parser call for a pre-quantized model is exactly same as frontend parser call for a FP32 -# model. We encourage you to remove the comment from print(mod) and inspect the Relay module. You -# will see many QNN operators, like, Requantize, Quantize and QNN Conv2D. -dtype_dict = {"input": data.dtype.name} -shape_dict = {"input": data.shape} - -mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict) -# print(mod) - -############################################################################### -# Lets now the compile the Relay module. We use the "llvm" target here. Please replace it with the -# target platform that you are interested in. -target = "llvm" -with tvm.transform.PassContext(opt_level=3): - lib = relay.build_module.build(mod, target=target, params=params) - -############################################################################### -# Finally, lets call inference on the TVM compiled module. -tvm_pred, rt_mod = run_tvm(lib) - -############################################################################### -# Accuracy comparison -# ------------------- - -############################################################################### -# Print the top-5 labels for MXNet and TVM inference. -# Checking the labels because the requantize implementation is different between -# TFLite and Relay. This cause final output numbers to mismatch. So, testing accuracy via labels. 
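###############################################################################
# A small hedged addition (not in the original script): besides printing the two
# top-5 lists below, count how many labels they actually share.
common_labels = set(tvm_pred.tolist()) & set(tflite_pred.tolist())
print("Top-5 labels shared by TVM and TFLite:", len(common_labels))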
- -print("TVM Top-5 labels:", tvm_pred) -print("TFLite Top-5 labels:", tflite_pred) - - -########################################################################## -# Measure performance -# ------------------- -# Here we give an example of how to measure performance of TVM compiled models. -n_repeat = 100 # should be bigger to make the measurement more accurate -dev = tvm.cpu(0) -print(rt_mod.benchmark(dev, number=1, repeat=n_repeat)) - -###################################################################### -# .. note:: -# -# Unless the hardware has special support for fast 8 bit instructions, quantized models are -# not expected to be any faster than FP32 models. Without fast 8 bit instructions, TVM does -# quantized convolution in 16 bit, even if the model itself is 8 bit. -# -# For x86, the best performance can be achieved on CPUs with AVX512 instructions set. -# In this case, TVM utilizes the fastest available 8 bit instructions for the given target. -# This includes support for the VNNI 8 bit dot product instruction (CascadeLake or newer). -# For EC2 C5.12x large instance, TVM latency for this tutorial is ~2 ms. -# -# Intel conv2d NCHWc schedule on ARM gives better end-to-end latency compared to ARM NCHW -# conv2d spatial pack schedule for many TFLite networks. ARM winograd performance is higher but -# it has a high memory footprint. -# -# Moreover, the following general tips for CPU performance equally applies: -# -# * Set the environment variable TVM_NUM_THREADS to the number of physical cores -# * Choose the best target for your hardware, such as "llvm -mcpu=skylake-avx512" or -# "llvm -mcpu=cascadelake" (more CPUs with AVX512 would come in the future) -# * Perform autotuning - :ref:`Auto-tuning a convolution network for x86 CPU -# `. -# * To get best inference performance on ARM CPU, change target argument -# according to your device and follow :ref:`Auto-tuning a convolution -# network for ARM CPU `. diff --git a/gallery/how_to/deploy_models/deploy_sparse.py b/gallery/how_to/deploy_models/deploy_sparse.py deleted file mode 100644 index c90a3b566e7a..000000000000 --- a/gallery/how_to/deploy_models/deploy_sparse.py +++ /dev/null @@ -1,363 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Deploy a Hugging Face Pruned Model on CPU -========================================= -**Author**: `Josh Fromm `_ - -This tutorial demonstrates how to take any pruned model, in this case `PruneBert -from Hugging Face -`_, -and use TVM to leverage the model's sparsity support to produce real speedups. Although -the primary purpose of this tutorial is to realize speedups on already pruned -models, it may also be useful to estimate how fast a model would be *if* it were -pruned. 
To this end, we also provide a function that takes an unpruned model and -replaces its weights -with random and pruned weights at a specified sparsity. This may be a useful -feature when trying to decide if a model is worth pruning or not. - -Before we get into the code, it's useful to discuss sparsity and pruning -and dig into the two -different types of sparsity: **structured** and **unstructured**. - -Pruning is a technique primarily used to reduce the parameter size of a model -by replacing weight values with 0s. Although many methods exist for choosing which -weights should be set to 0, the most straight forward is by picking the -weights with the smallest value. Typically, weights are pruned to a desired -sparsity percentage. For example, a 95% sparse model would have only 5% of -its weights non-zero. Pruning to very high sparsities often requires -fine-tuning or full retraining as it tends to be a lossy approximation. -Although parameter size benefits are quite easy to obtain from a pruned model -through simple compression, leveraging sparsity to yield runtime speedups -is more complicated. - -In structured sparsity weights are pruned with the goal of clustering -pruned weights together. In other words, they are pruned using both their -value and location. The benefit of bunching up pruned weights is that it allows -an algorithm such as matrix multiplication to skip entire blocks. It turns out -that some degree of *block sparsity* is very important to realizing significant -speedups on most hardware available today. -This is because when loading memory in most CPUs or GPUs, -it doesn't save any work to skip reading a single value at a time, instead an entire -chunk or tile is read in and executed using something like vectorized instructions. - -Unstructured sparse weights are those that are pruned only on the value of -the original weights. They may appear to be scattered randomly throughout -a tensor rather than in chunks like we'd see in block sparse weights. -At low sparsities, unstructured pruning techniques are difficult to -accelerate. However, at high sparsities many blocks of all 0 values -will naturally appear, making it possible to accelerate. - -This tutorial interacts with both structured and unstructured sparsity. -Hugging Face's PruneBert model is unstructured but 95% sparse, allowing us -to apply TVM's block sparse optimizations to it, even if not optimally. -When generating random sparse weights for an unpruned model, we do so with structured -sparsity. A fun exercise is comparing the real speed of PruneBert with the block -sparse speed using fake weights to see the benefit of structured sparsity. -""" - - -############################################################################### -# Load Required Modules -# --------------------- -# Other than TVM, scipy, the latest transformers, and -# tensorflow 2.2+ are required. -import os -import tvm -import time -import itertools -import numpy as np -import tensorflow as tf -from tvm import relay, runtime -from tvm.contrib import graph_executor -from tvm.relay import data_dep_optimization as ddo -from tensorflow.python.framework.convert_to_constants import ( - convert_variables_to_constants_v2, -) -import scipy.sparse as sp - - -# Ask tensorflow to limit its GPU memory to what's actually needed -# instead of gobbling everything that's available. -# https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth -# This way this tutorial is a little more friendly to sphinx-gallery. 
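###############################################################################
# Before the GPU-memory setup below, a tiny hedged illustration (not part of the
# original tutorial) of the block sparsity described above: a BSR matrix keeps
# only the non-zero BS_R x BS_C blocks plus their block indices, which is what
# lets a kernel skip whole blocks of zeros at once.
toy = np.zeros((4, 4), dtype="float32")
toy[0:2, 2:4] = 1.0  # a single non-zero 2x2 block
toy_bsr = sp.bsr_matrix(toy, blocksize=(2, 2))
print("stored BSR blocks:", toy_bsr.data.shape)  # (1, 2, 2): only one block kept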
-gpus = tf.config.list_physical_devices("GPU") -if gpus: - try: - for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) - print("tensorflow will use experimental.set_memory_growth(True)") - except RuntimeError as e: - print("experimental.set_memory_growth option is not available: {}".format(e)) - - -############################################################################### -# Configure Settings -# ------------------ -# Let's start by defining some parameters that define the type of model -# and sparsity to run. - -# The name of the transformer model to download and run. -name = "huggingface/prunebert-base-uncased-6-finepruned-w-distil-squad" -# The number of batches in an input. -batch_size = 1 -# The length of each input sequence. -seq_len = 128 -# TVM platform identifier. Note that best cpu performance can be achieved by setting -mcpu -# appropriately for your specific machine. CUDA and ROCm are also supported. -target = "llvm" -# Which device to run on. Should be one of tvm.cpu() or tvm.cuda(). -dev = tvm.cpu() -# If true, then a sparse variant of the network will be run and -# benchmarked. -measure_sparse = True -# The block size of structured sparsity to convert weight tensors -# into. Changing this parameter may yield speedups for some platforms. -bs_r = 1 -# For models besides PruneBert (which is 95% sparse), this parameter -# determines how sparse the generated weights should be. The higher -# the sparsity, the faster the result. -sparsity = 0.85 - - -############################################################################### -# Download and Convert Transformers Model -# --------------------------------------- -# Now we'll grab a model from the transformers module, download it, -# convert it into a TensorFlow graphdef in preperation for converting that graphdef into -# a relay graph that we can optimize and deploy. -def load_keras_model(module, name, seq_len, batch_size, report_runtime=True): - model = module.from_pretrained(name) - dummy_input = tf.keras.Input(shape=[seq_len], batch_size=batch_size, dtype="int32") - dummy_out = model(dummy_input) # Propagate shapes through the keras model. - if report_runtime: - np_input = np.random.uniform(size=[batch_size, seq_len], low=0, high=seq_len).astype( - "int32" - ) - start = time.time() - repeats = 50 - for i in range(repeats): - np_out = model(np_input) - end = time.time() - print("Keras Runtime: %f ms." % (1000 * ((end - start) / repeats))) - return model - - -def convert_to_graphdef(model, batch_size, seq_len): - model_func = tf.function(lambda x: model(x)) - input_dict = model._saved_model_inputs_spec - input_spec = input_dict[list(input_dict.keys())[0]] - model_func = model_func.get_concrete_function( - tf.TensorSpec([batch_size, seq_len], input_spec.dtype) - ) - frozen_func = convert_variables_to_constants_v2(model_func) - return frozen_func.graph.as_graph_def() - - -def download_model(name, batch_size, seq_len): - import transformers - - module = getattr(transformers, "TFBertForSequenceClassification") - model = load_keras_model(module, name=name, batch_size=batch_size, seq_len=seq_len) - return convert_to_graphdef(model, batch_size, seq_len) - - -############################################################################### -# Convert to Relay Graph -# ---------------------- -# We now have all the tooling to get a transformers model in the right format -# for relay conversion. Let's import it! 
In the following function we -# save the imported graph in relay's json format so that we dont have -# to reimport from tensorflow each time this script is run. -def import_graphdef( - name, - batch_size, - seq_len, - save_relay=True, - relay_file="model.json", - relay_params="model.params", -): - abs_path = os.path.dirname(os.path.abspath(__file__)) - shape_dict = {"input_1": (batch_size, seq_len)} - relay_file = ("%s_%d_%d_%s" % (name, batch_size, seq_len, relay_file)).replace("/", "_") - relay_params = ("%s_%d_%d_%s" % (name, batch_size, seq_len, relay_params)).replace("/", "_") - if os.path.exists(os.path.join(abs_path, relay_file)) and os.path.exists( - os.path.join(abs_path, relay_params) - ): - with open(os.path.join(abs_path, relay_file), "r") as fi: - mod = tvm.ir.load_json(fi.read()) - with open(os.path.join(abs_path, relay_params), "rb") as fi: - params = relay.load_param_dict(fi.read()) - else: - graph_def = download_model(name, batch_size, seq_len) - - mod, params = relay.frontend.from_tensorflow(graph_def, shape=shape_dict) - - if save_relay: - with open(os.path.join(abs_path, relay_file), "w") as fo: - fo.write(tvm.ir.save_json(mod)) - with open(os.path.join(abs_path, relay_params), "wb") as fo: - fo.write(runtime.save_param_dict(params)) - - return mod, dict(params.items()), shape_dict - - -############################################################################### -# Run the Dense Graph -# ------------------- -# Let's run the default version of the imported model. Note that even if -# the weights are sparse, we won't see any speedup because we are using -# regular dense matrix multiplications on these dense (but mostly zero) -# tensors instead of sparse aware kernels. -def run_relay_graph(mod, params, shape_dict, target, dev): - with relay.build_config(opt_level=3): - lib = relay.build(mod, target=target, params=params) - input_shape = shape_dict["input_1"] - dummy_data = np.random.uniform(size=input_shape, low=0, high=input_shape[1]).astype("int32") - - m = graph_executor.GraphModule(lib["default"](dev)) - m.set_input(0, dummy_data) - m.run() - tvm_output = m.get_output(0) - - print(m.benchmark(dev, repeat=5, number=5)) - return tvm_output - - -def run_dense(mod, params, shape_dict, target, dev): - print("Dense Model Benchmark:") - return run_relay_graph(mod, params, shape_dict, target, dev) - - -############################################################################### -# Run the Sparse Graph -# -------------------- -# Next we'll convert the graph into a sparse representation and generate -# fake sparse weights if needed. Then we'll use the same benchmarking -# script as dense to see how much faster we go! We apply a few relay passes -# to the graph to get it leveraging sparsity. First we use -# `simplify_fc_transpose` to use transposes on the weights of dense layers -# into the parameters. This makes it easier to convert to matrix multiplies -# to sparse versions. Next we apply `bsr_dense.convert` to identify all -# weight matrices that can be sparse, and automatically replace them. -# -# The `bsr_dense.convert` call below is doing the heavy lifting of identifying -# which weights in the model can be made sparse by checking if they are -# at least `sparsity_threshold` percent sparse. If so, it converts those -# weights into *Block Compressed Row Format (BSR)*. BSR is essentially -# a representation that indexes into the nonzero chunks of the tensor, -# making it easy for an algorithm to load those non-zero chunks and ignore -# the rest of the tensor. 
Once the sparse weights are in BSR format, -# `relay.transform.DenseToSparse` is applied to actually replace -# `relay.dense` operations with `relay.sparse_dense` calls that can be -# run faster. -def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype="float32"): - Y = np.zeros((M, N), dtype=dtype) - assert M % BS_R == 0 - assert N % BS_C == 0 - nnz = int(density * M * N) - num_blocks = int(nnz / (BS_R * BS_C)) + 1 - candidate_blocks = np.asarray(list(itertools.product(range(0, M, BS_R), range(0, N, BS_C)))) - assert candidate_blocks.shape[0] == M // BS_R * N // BS_C - chosen_blocks = candidate_blocks[ - np.random.choice(candidate_blocks.shape[0], size=num_blocks, replace=False) - ] - for i in range(len(chosen_blocks)): - r, c = chosen_blocks[i] - Y[r : r + BS_R, c : c + BS_C] = np.random.uniform(-0.1, 0.1, (BS_R, BS_C)) - s = sp.bsr_matrix(Y, blocksize=(BS_R, BS_C)) - assert s.data.shape == (num_blocks, BS_R, BS_C) - assert s.data.size >= nnz - assert s.indices.shape == (num_blocks,) - assert s.indptr.shape == (M // BS_R + 1,) - return s.todense() - - -def random_sparse_bert_params(func, params, density, BS_R, BS_C): - def deepcopy(param_dic): - ret = {} - for k, v in param_dic.items(): - ret[k] = tvm.nd.array(v.numpy()) - return ret - - new_params = deepcopy(params) - dense_weight_names = relay.analysis.sparse_dense._search_dense_op_weight(func) - for item in dense_weight_names: - name = str(item) - shape = new_params[name].shape - if shape[0] % BS_R == 0 and shape[1] % BS_C == 0: - new_w = random_bsr_matrix(shape[0], shape[1], BS_R, BS_C, density) - new_params[name] = tvm.nd.array(new_w) - return new_params - - -def run_sparse(mod, params, shape_dict, target, dev, bs_r, sparsity, gen_weights): - mod, params = ddo.simplify_fc_transpose.convert(mod["main"], params) - if gen_weights: - params = random_sparse_bert_params(mod, params, BS_R=bs_r, BS_C=1, density=1 - sparsity) - mod, params = ddo.bsr_dense.convert(mod, params, (bs_r, 1), sparsity_threshold=0.8) - print("Block Sparse Model with {blocksize}x1 blocks:".format(blocksize=bs_r)) - return run_relay_graph(mod, params, shape_dict, target, dev) - - -############################################################################### -# Run All the Code! -# ----------------- -# And that's it! Now we'll simply call all the needed function to benchmark -# the model according to the set parameters. Note that to run this code -# you'll need to uncomment the last line first. -def benchmark(): - mod, params, shape_dict = import_graphdef(name, batch_size, seq_len) - run_dense(mod, params, shape_dict, target, dev) - if measure_sparse: - gen_weights = "prune" not in name - run_sparse(mod, params, shape_dict, target, dev, bs_r, sparsity, gen_weights) - - -# benchmark() - -############################################################################### -# Sample Output -# ------------- -# For reference, below is the output of the script when run on an AMD CPU -# and shows about a 2.5X speedup from using sparsity. - -# Dense Model Benchmark: -# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. 
-# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (3072, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=llvm, workload=('dense_nopack.x86', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=llvm, workload=('batch_matmul.x86', ('TENSOR', (12, 128, 128), 'float32'), ('TENSOR', (12, 64, 128), 'float32')). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=llvm, workload=('batch_matmul.x86', ('TENSOR', (12, 128, 64), 'float32'), ('TENSOR', (12, 128, 64), 'float32')). A fallback configuration is used, which may bring great performance regression. -# Runtime: 165.26 ms (12.83 ms) -# Block Sparse Model with 1x1 blocks: -# Runtime: 67.75 ms (8.83 ms) - -# Here is the output of this script on a GPU (GTX 1070) with the target "cuda -libs=cublas". -# -# Dense Model Benchmark: -# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (3072, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 128), 'float32'), ('TENSOR', (12, 64, 128), 'float32'), (12, 128, 64)). A fallback configuration is used, which may bring great performance regression. -# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 64), 'float32'), ('TENSOR', (12, 128, 64), 'float32'), (12, 128, 128)). 
A fallback configuration is used, which may bring great performance regression. -# Runtime: 10.64 ms (0.29 ms) -# Block Sparse Model with 1x1 blocks: -# Runtime: 6.46 ms (0.05 ms) diff --git a/gallery/how_to/extend_tvm/README.txt b/gallery/how_to/extend_tvm/README.txt deleted file mode 100644 index 09cd220c80b3..000000000000 --- a/gallery/how_to/extend_tvm/README.txt +++ /dev/null @@ -1,7 +0,0 @@ -Extend TVM ----------- - -TVM is an extensible development platform, with many points of entry to work -with, including options for bringing new datatypes and adding lower level -custom optimization passes. These how-tos describe some ways that TVM can be -extended. diff --git a/gallery/how_to/extend_tvm/low_level_custom_pass.py b/gallery/how_to/extend_tvm/low_level_custom_pass.py deleted file mode 100644 index 50634116ce8e..000000000000 --- a/gallery/how_to/extend_tvm/low_level_custom_pass.py +++ /dev/null @@ -1,172 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Writing a Customized Pass -========================= -**Author**: `Jian Weng `_ - -TVM is a framework that abstracts away the heterogenity of machine learning accelerators. -Sometimes users may want customize some analysis and IR transformations -to adapt TVM to their own specialized hardware. This tutorial helps users write -a customized pass in TVM. - -Prerequisites -------------- - -Before reading this tutorial, we assume readers have already known these topics well: - -- Writing an algorithm in TVM and schedule it. Otherwise, see example tutorials like - :ref:`opt-gemm`. -- The basic structure of HalideIR. Otherwise, see ``HalideIR/src/ir/IR.h`` to learn what - attributes of IR nodes are defined. -- Visitor design pattern. Otherwise, check the - `Python AST module `_ to see how an AST - visitor is implemented. -- How a Schedule is lowered to either an IRModule class or a LLVM module. Otherwise, - take a look at ``python/tvm/build_module.py`` to get some basics. - -""" - -import tvm -from tvm import te -import numpy as np - -###################################################################### -# We first write a very simple vector add and build it with the default schedule. Then, we use -# our customized lowering pass to manipulate the IR directly instead of using schedule primitives. -# - -n = tvm.tir.const(128, "int32") -a = te.placeholder((n,), name="a") -b = te.placeholder((n,), name="b") -c = te.compute((n,), lambda i: a[i] + b[i], name="c") - -sch = te.create_schedule(c.op) -ir = tvm.lower(sch, [a, b, c]) -print(ir) - -###################################################################### -# Writing a Pass -# -------------- -# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement. 
-# Thus, we define this vectorize function and implement it step by step. -# - -###################################################################### -# TVM already provides two class for users to both analyze and transform IR. -# -# IR Visitor -# ~~~~~~~~~~ -# We can use ``tvm.tir.stmt_functor.post_order_visit(stmt, func)`` to gather information from the Halide IR. -# ``func`` is a function callback. This function will be called before exiting the current IR node, -# i.e. post-order visit. Then we leverage side effects to store the result of IR visit, because the -# return value of ``func`` will be ignored. -# -# .. note:: -# -# You MUST use some array to store the result of IR visit. Even the value is a single variable. -# This is mainly due to the constraints in the Python-C runtime. The variable values will be -# refreshed every recursion but the array values will be preserved. -# - -loops = [] - - -def find_width8(op): - """Find all the 'tir.For' nodes whose extent can be divided by 8.""" - if isinstance(op, tvm.tir.For): - if isinstance(op.extent, tvm.tir.IntImm): - if op.extent.value % 8 == 0: - loops.append(op) - - -##################################################################### -# IR Transformation -# ~~~~~~~~~~~~~~~~~ -# The transformation interface is slightly different from the visitor interface. There is only a -# post-order callback in the visitor, but transformation visitor supports both a pre-order and a -# post-order callback. If you want to keep the origin IR node, just return None. If you want to -# change the current node to some node, use TVM IR maker interface to build it and return -# this value. -# -# .. note:: -# -# If the pre-order function is called and returns a value which is not None, the post-order -# function will be skipped. -# - - -def vectorize8(op): - """Split can vectorize the loops found in `find_width8`.""" - if op in loops: - extent = op.extent.value - name = op.loop_var.name - lo, li = te.var(name + ".outer"), te.var(name + ".inner") - body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li}) - body = tvm.tir.For(li, 0, 8, tvm.tir.ForKind.VECTORIZED, body) - body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.ForKind.SERIAL, body) - return body - return None - - -@tvm.tir.transform.prim_func_pass(opt_level=0) -def vectorize(f, mod, ctx): - global loops - - tvm.tir.stmt_functor.post_order_visit(f.body, find_width8) - - if not loops: - return f - - # The last list arugment indicates what kinds of nodes will be transformed. - # Thus, in this case only `For` nodes will call `vectorize8` - return f.with_body(tvm.tir.stmt_functor.ir_transform(f.body, None, vectorize8, ["tir.For"])) - - -##################################################################### -# Glue to Lowering -# ---------------- -# So far, we are done with writing this IR transformation pass. What we need to do next is to glue -# this pass to TVM's lower pass. -# -# In this case, we inject the pass written above into the TVM standard lowering -# pass by feeding **a list of tuple** as argument to ``tir.add_lower_pass``. "Tuple" indicates different -# phases of lowering. In TVM, there are four phases of lowering and user-customized ones will be -# called after each phase is done. -# -# .. note:: -# Here are the essential transformations done by each phase: -# - Phase 0 generates the raw IR and loop levels. -# - Phase 1 flattens the array storage. -# - Phase 2 transforms loops, like unroll, vectorization and thread-binding. -# - Phase 3 does some cleanup work. 
-# -# Thus, a good place to put this transformation pass is just after Phase 1. -# - -with tvm.transform.PassContext(config={"tir.add_lower_pass": [(1, vectorize)]}): - print(tvm.lower(sch, [a, b, c])) - -##################################################################### -# Quick View -# ---------- -# This tutorial gives a quick view of writing a customized IR transformation pass: -# - Use ``tvm.tir.stmt_functor.post_order_visit`` to gather information on each IR nodes. -# - Use ``tvm.tir.stmt_functor.ir_transform`` to transform IR nodes. -# - Wrap up two above to write an IR-transformation function. -# - Use ``tvm.transform.PassContext`` to put this function to TVM lowering pass -# diff --git a/gallery/how_to/extend_tvm/use_pass_infra.py b/gallery/how_to/extend_tvm/use_pass_infra.py deleted file mode 100644 index f82cf40029d4..000000000000 --- a/gallery/how_to/extend_tvm/use_pass_infra.py +++ /dev/null @@ -1,274 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=line-too-long -""" -.. _tutorial-use-pass-infra: - -How to Use TVM Pass Infra -========================= -**Author**: `Zhi Chen `_ - -As the number of optimization passes increases in Relay/tir, it becomes intractable to -execute them and maintain their dependencies manually. Therefore, we have -introduced an infrastructure to manage the optimization passes and make it -applicable to different layers of the IR in the TVM stack. - -The optimizations of a Relay/tir program could be applied at various granularity, -namely function-level and module-level using :py:class:`tvm.relay.transform.FunctionPass`/ -:py:class:`tvm.tir.transform.PrimFuncPass` and :py:class:`tvm.transform.ModulePass` -respectively. Or users can rely on :py:class:`tvm.transform.Sequential` to apply a sequence of passes -on a Relay/tir program where the dependencies between passes can be resolved by the -pass infra. For more details about each type of these passes, please refer to -the :ref:`pass-infra` - -This tutorial mainly demonstrates how developers can use the pass infra to perform -a certain optimization and create an optimization pipeline for a Relay program. -The same approach can be used for tir as well. -""" - - -import numpy as np -import tvm -from tvm import te -import tvm.relay as relay - -############################################################################### -# Create An Example Relay Program -# ------------------------------- -# First of all, we create a simple Relay program for the tutorial. This program -# will be used by various optimizations of the examples in this tutorial. -# Similarly, users can write a tir primitive function and apply the tir passes. 
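As a brief hedged aside (not part of the original tutorial, and exact API names vary slightly across TVM versions), the same pass machinery also applies to tir: a schedule can be lowered to an IRModule of PrimFuncs and a built-in tir pass applied to it directly.

import tvm
from tvm import te

# Build a trivial tir PrimFunc by lowering a tiny TE computation ...
n = 16
A = te.placeholder((n,), name="A")
B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
tir_mod = tvm.lower(te.create_schedule(B.op), [A, B])  # IRModule containing PrimFuncs
# ... and run a standard tir pass over the module, just like a Relay pass.
tir_mod = tvm.tir.transform.Simplify()(tir_mod)
print(tir_mod)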
- - -def example(): - shape = (1, 64, 54, 54) - c_data = np.empty(shape).astype("float32") - c = relay.const(c_data) - weight = relay.var("weight", shape=(64, 64, 3, 3)) - x = relay.var("x", relay.TensorType((1, 64, 56, 56), "float32")) - conv = relay.nn.conv2d(x, weight) - y = relay.add(c, c) - y = relay.multiply(y, relay.const(2, "float32")) - y = relay.add(conv, y) - z = relay.add(y, c) - z1 = relay.add(y, c) - z2 = relay.add(z, z1) - return relay.Function([x, weight], z2) - - -############################################################################### -# Optimize the Program -# -------------------- -# Now we would like to optimize the program. Relay features a host of -# optimizations. We will select some of them to apply on this example program. -# -# There are multiple ways to optimize a Relay program. Below we will provide -# examples for each of them. -# -# Manually Apply Optimization Passes -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -# Let's first create a relay Module which contains one or multiple Relay -# functions for optimization. -f = example() -mod = tvm.IRModule.from_expr(f) - -# Now we can apply constant folding on the module. -# fold_const here is a callback that doesn't take any parameters. -fold_const = relay.transform.FoldConstant() -# Then, we can invoke the pass on the given module. Note that the constant -# folding pass works at the function-level. That being said, each function in -# the module will be applied with the optimization. Users don't need to iterate -# through individual functions manually to apply this pass. -mod = fold_const(mod) -# We can see from the updated program that the constants are folded. -print(mod) - -############################################################################### -# More optimizations can be applied in the similar manner. For instance, we can -# eliminate the common expressions that used by `z` and `z1`. -mod = relay.transform.EliminateCommonSubexpr()(mod) -print(mod) - -############################################################################### -# Some optimizations, such as fusion, are parametric as well. For example, -# opt level 0 will not allow operators to be fused together. Users can pass the -# `fuse_opt_level` to enable this. -mod = relay.transform.FuseOps(fuse_opt_level=0)(mod) - -# We can observe that the optimized module contains functions that only have -# a signle primitive op. -print(mod) - -############################################################################### -# Use Sequential to Apply a Sequence of Passes -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Applying passes as above is actually tedious and it may require users to have -# better understanding about the dependencies between them. For example, fusion -# currently doesn't work well on let bindings. Therefore, we would not be able -# to fuse operators that were fusable if :py:func:`relay.transform.ToANormalForm` is applied before -# fusion, as this pass generates let bindings for each expression to -# canonicalize a Relay program. -# -# Relay, hence, provides :py:class:`tvm.transform.Sequential` to alleviate developers from handling -# these issues explicitly by specifying the required passes of each pass and -# packing them as a whole to execute. For example, the same passes can now be -# applied using the sequential style as the following. :py:class:`tvm.transform.Sequential` is -# similar to `torch.nn.sequential `_ -# and `mxnet.gluon.block `_. 
-# For example, `torch.nn.sequential` is used to contain a sequence of PyTorch -# `Modules` that will be added to build a network. It focuses on the network -# layers. Instead, the :py:class:`tvm.transform.Sequential` in our pass infra works on the optimizing -# pass. - -# Now let's execute some passes through :py:class:`tvm.transform.Sequential` -f = example() -mod = tvm.IRModule.from_expr(f) -# Glob the interested passes. -seq = tvm.transform.Sequential( - [ - relay.transform.FoldConstant(), - relay.transform.EliminateCommonSubexpr(), - relay.transform.FuseOps(fuse_opt_level=2), - ] -) -mod1 = seq(mod) -print(mod1) - -############################################################################### -# From the transformed Relay program, we can see that there are still two -# identical addition operations. This is because ``EliminateCommonSubexpr`` -# was not actually performed. The reason is because only the passes that have -# optimization level less or equal to 2 will be executed by default under -# :py:class:`tvm.transform.Sequential`. The pass infra, -# however, provides a configuration interface -# for users to customize the optimization level that they want to execute. - -with tvm.transform.PassContext(opt_level=3): - mod2 = seq(mod) -print(mod2) - -############################################################################### -# Now we can see that only one of the two identical additions is kept. -# -# In addition, users can selectively disable some passes using the -# `disabled_pass` config, which is similar to the `-fno-xxx` option used the -# general purpose compilers, such as Clang and GCC. For example, we can disable -# EliminateCommonSubexpr as following. The printed module will again show two -# identical addition operations. - -with tvm.transform.PassContext(opt_level=3, disabled_pass=["EliminateCommonSubexpr"]): - mod3 = seq(mod) -print(mod3) - -############################################################################## -# Implement a Pass Using Python Decorator -# ------------------------------------------ -# The next example illustrates how we can orchestrate a customized optimization -# pipeline through the pass infra using Python decorators. This functionality -# greatly eases the implementation of passes. For example, users can simply -# define a decorated class to do function-level optimizations as the following -# example shows. `transform_function` wraps a class to replace all constants -# with a multiple of `c`. Later on, each function in a given module will be -# visited and each constant in the function will be replaced when we invoke the -# customized pass. - - -@relay.transform.function_pass(opt_level=1) -class CustomPipeline: - """Simple test function to replace one argument to another.""" - - def __init__(self, multiplier): - self.multiplier = multiplier - - # This function can define a pass. 
- def transform_function(self, func, mod, ctx): - obj = self - - class ReplaceConstant(tvm.relay.ExprMutator): - def visit_constant(self, c): - return relay.multiply(obj.multiplier, c) - - return ReplaceConstant().visit(func) - - -f = example() -mod = tvm.IRModule.from_expr(f) -custom_pass = CustomPipeline(multiplier=relay.const(3, "float32")) -assert custom_pass.info.name == "CustomPipeline" -mod3 = custom_pass(mod) -print(mod3) - -############################################################################## -# Debug a Pass -# ------------ -# TVM provides users a plug-and-play style debugging pass that print the IR -# after a certain pass is done through a special pass (``PrintIR``) to dump the IR of the -# whole module. A slightly modified version of the sequential pass example -# could be like the following to enable IR dumping for ``FoldConstant`` optimization. - -f = example() -mod = tvm.IRModule.from_expr(f) -seq = tvm.transform.Sequential( - [ - relay.transform.FoldConstant(), - tvm.transform.PrintIR(), - relay.transform.EliminateCommonSubexpr(), - relay.transform.FuseOps(), - ] -) - -############################################################################### -# By inserting the ``PrintIR`` pass after ``FoldConstant``, the pass infra will -# dump out the module IR when ``FoldConstant`` is done. Users can plug in this -# pass after any pass they want to debug for viewing the optimization effect. -# -# There is a more flexible debugging mechanism. One can implement a ``PassInstrument`` -# class to execute arbitrary code not only before and/or after each pass but also -# at entering/exiting ``PassContext``. See :ref:`pass_instrument_cpp_backend` -# for more details. -# -# Here we use :py::func`tvm.instrument.pass_instrument` decorator to implement -# a PassInsturment class printing IR before execution of each passes: - - -@tvm.instrument.pass_instrument -class PrintIR: - """Print the name of the pass, the IR, only before passes execute.""" - - def run_before_pass(self, mod, info): - print("Running pass: {}", info) - print(mod) - - -with tvm.transform.PassContext(opt_level=3, instruments=[PrintIR()]): - with tvm.target.Target("llvm"): - # Perform the optimizations. - mod = seq(mod) -print(mod) - -print("done") - -############################################################################## -# Summary -# ------- -# This tutorial has covered how we can write and invoke passes in TVM more -# conveniently using the pass infra. Different ways of invoking a pass are also -# discussed. Using :py:class:`tvm.transform.Sequential` can largely help -# users to ease the work of handling multiple optimization passes and their -# dependencies. In addition, an example is provided to illustrate -# how we can debug a pass using the ``PrintIR`` and tracing. diff --git a/gallery/how_to/extend_tvm/use_pass_instrument.py b/gallery/how_to/extend_tvm/use_pass_instrument.py deleted file mode 100644 index fd965cdf973a..000000000000 --- a/gallery/how_to/extend_tvm/use_pass_instrument.py +++ /dev/null @@ -1,373 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=line-too-long -""" -.. _tutorial-use-pass-instrument: - -How to Use TVM Pass Instrument -============================== -**Author**: `Chi-Wei Wang `_ - -As more and more passes are implemented, it becomes useful to instrument -pass execution, analyze per-pass effects, and observe various events. - -We can instrument passes by providing a list of :py:class:`tvm.ir.instrument.PassInstrument` -instances to :py:class:`tvm.transform.PassContext`. We provide a pass instrument -for collecting timing information (:py:class:`tvm.ir.instrument.PassTimingInstrument`), -but an extension mechanism is available via the :py:func:`tvm.instrument.pass_instrument` decorator. - -This tutorial demonstrates how developers can use ``PassContext`` to instrument -passes. Please also refer to the :ref:`pass-infra`. -""" - -import tvm -import tvm.relay as relay -from tvm.relay.testing import resnet -from tvm.contrib.download import download_testdata -from tvm.relay.build_module import bind_params_by_name -from tvm.ir.instrument import ( - PassTimingInstrument, - pass_instrument, -) - - -############################################################################### -# Create An Example Relay Program -# ------------------------------- -# We use pre-defined resnet-18 network in Relay. -batch_size = 1 -num_of_image_class = 1000 -image_shape = (3, 224, 224) -output_shape = (batch_size, num_of_image_class) -relay_mod, relay_params = resnet.get_workload(num_layers=18, batch_size=1, image_shape=image_shape) -print("Printing the IR module...") -print(relay_mod.astext(show_meta_data=False)) - - -############################################################################### -# Create PassContext With Instruments -# ----------------------------------- -# To run all passes with an instrument, pass it via the ``instruments`` argument to -# the ``PassContext`` constructor. A built-in ``PassTimingInstrument`` is used to -# profile the execution time of each passes. -timing_inst = PassTimingInstrument() -with tvm.transform.PassContext(instruments=[timing_inst]): - relay_mod = relay.transform.InferType()(relay_mod) - relay_mod = relay.transform.FoldScaleAxis()(relay_mod) - # before exiting the context, get profile results. - profiles = timing_inst.render() -print("Printing results of timing profile...") -print(profiles) - - -############################################################################### -# Use Current PassContext With Instruments -# ---------------------------------------- -# One can also use the current ``PassContext`` and register -# ``PassInstrument`` instances by ``override_instruments`` method. -# Note that ``override_instruments`` executes ``exit_pass_ctx`` method -# if any instrument already exists. Then it switches to new instruments -# and calls ``enter_pass_ctx`` method of new instruments. -# Refer to following sections and :py:func:`tvm.instrument.pass_instrument` for these methods. 
-cur_pass_ctx = tvm.transform.PassContext.current() -cur_pass_ctx.override_instruments([timing_inst]) -relay_mod = relay.transform.InferType()(relay_mod) -relay_mod = relay.transform.FoldScaleAxis()(relay_mod) -profiles = timing_inst.render() -print("Printing results of timing profile...") -print(profiles) - - -############################################################################### -# Register empty list to clear existing instruments. -# -# Note that ``exit_pass_ctx`` of ``PassTimingInstrument`` is called. -# Profiles are cleared so nothing is printed. -cur_pass_ctx.override_instruments([]) -# Uncomment the call to .render() to see a warning like: -# Warning: no passes have been profiled, did you enable pass profiling? -# profiles = timing_inst.render() - - -############################################################################### -# Create Customized Instrument Class -# ---------------------------------- -# A customized instrument class can be created using the -# :py:func:`tvm.instrument.pass_instrument` decorator. -# -# Let's create an instrument class which calculates the change in number of -# occurrences of each operator caused by each pass. We can look at ``op.name`` to -# find the name of each operator. And we do this before and after passes to calculate the difference. - - -@pass_instrument -class RelayCallNodeDiffer: - def __init__(self): - self._op_diff = [] - # Passes can be nested. - # Use stack to make sure we get correct before/after pairs. - self._op_cnt_before_stack = [] - - def enter_pass_ctx(self): - self._op_diff = [] - self._op_cnt_before_stack = [] - - def exit_pass_ctx(self): - assert len(self._op_cnt_before_stack) == 0, "The stack is not empty. Something wrong." - - def run_before_pass(self, mod, info): - self._op_cnt_before_stack.append((info.name, self._count_nodes(mod))) - - def run_after_pass(self, mod, info): - # Pop out the latest recorded pass. - name_before, op_to_cnt_before = self._op_cnt_before_stack.pop() - assert name_before == info.name, "name_before: {}, info.name: {} doesn't match".format( - name_before, info.name - ) - cur_depth = len(self._op_cnt_before_stack) - op_to_cnt_after = self._count_nodes(mod) - op_diff = self._diff(op_to_cnt_after, op_to_cnt_before) - # only record passes causing differences. - if op_diff: - self._op_diff.append((cur_depth, info.name, op_diff)) - - def get_pass_to_op_diff(self): - """ - return [ - (depth, pass_name, {op_name: diff_num, ...}), ... - ] - """ - return self._op_diff - - @staticmethod - def _count_nodes(mod): - """Count the number of occurrences of each operator in the module""" - ret = {} - - def visit(node): - if isinstance(node, relay.expr.Call): - if hasattr(node.op, "name"): - op_name = node.op.name - else: - # Some CallNode may not have 'name' such as relay.Function - return - ret[op_name] = ret.get(op_name, 0) + 1 - - relay.analysis.post_order_visit(mod["main"], visit) - return ret - - @staticmethod - def _diff(d_after, d_before): - """Calculate the difference of two dictionary along their keys. - The result is values in d_after minus values in d_before. 
- """ - ret = {} - key_after, key_before = set(d_after), set(d_before) - for k in key_before & key_after: - tmp = d_after[k] - d_before[k] - if tmp: - ret[k] = d_after[k] - d_before[k] - for k in key_after - key_before: - ret[k] = d_after[k] - for k in key_before - key_after: - ret[k] = -d_before[k] - return ret - - -############################################################################### -# Apply Passes and Multiple Instrument Classes -# -------------------------------------------- -# We can use multiple instrument classes in a ``PassContext``. -# However, it should be noted that instrument methods are executed sequentially, -# obeying the order of ``instruments`` argument. -# So for instrument classes like ``PassTimingInstrument``, it is inevitable to -# count-up the execution time of other instrument classes to the final -# profile result. -call_node_inst = RelayCallNodeDiffer() -desired_layouts = { - "nn.conv2d": ["NHWC", "HWIO"], -} -pass_seq = tvm.transform.Sequential( - [ - relay.transform.FoldConstant(), - relay.transform.ConvertLayout(desired_layouts), - relay.transform.FoldConstant(), - ] -) -relay_mod["main"] = bind_params_by_name(relay_mod["main"], relay_params) -# timing_inst is put after call_node_inst. -# So the execution time of ``call_node.inst.run_after_pass()`` is also counted. -with tvm.transform.PassContext(opt_level=3, instruments=[call_node_inst, timing_inst]): - relay_mod = pass_seq(relay_mod) - profiles = timing_inst.render() -# Uncomment the next line to see timing-profile results. -# print(profiles) - - -############################################################################### -# We can see how many CallNode increase/decrease per op type. -from pprint import pprint - -print("Printing the change in number of occurrences of each operator caused by each pass...") -pprint(call_node_inst.get_pass_to_op_diff()) - - -############################################################################### -# Exception Handling -# ------------------ -# Let's see what happens if an exception occurs in a method of a ``PassInstrument``. -# -# Define ``PassInstrument`` classes which raise exceptions in enter/exit ``PassContext``: -class PassExampleBase: - def __init__(self, name): - self._name = name - - def enter_pass_ctx(self): - print(self._name, "enter_pass_ctx") - - def exit_pass_ctx(self): - print(self._name, "exit_pass_ctx") - - def should_run(self, mod, info): - print(self._name, "should_run") - return True - - def run_before_pass(self, mod, pass_info): - print(self._name, "run_before_pass") - - def run_after_pass(self, mod, pass_info): - print(self._name, "run_after_pass") - - -@pass_instrument -class PassFine(PassExampleBase): - pass - - -@pass_instrument -class PassBadEnterCtx(PassExampleBase): - def enter_pass_ctx(self): - print(self._name, "bad enter_pass_ctx!!!") - raise ValueError("{} bad enter_pass_ctx".format(self._name)) - - -@pass_instrument -class PassBadExitCtx(PassExampleBase): - def exit_pass_ctx(self): - print(self._name, "bad exit_pass_ctx!!!") - raise ValueError("{} bad exit_pass_ctx".format(self._name)) - - -############################################################################### -# If an exception occurs in ``enter_pass_ctx``, ``PassContext`` will disable the pass -# instrumentation. And it will run the ``exit_pass_ctx`` of each ``PassInstrument`` -# which successfully finished ``enter_pass_ctx``. -# -# In following example, we can see ``exit_pass_ctx`` of `PassFine_0` is executed after exception. 
-demo_ctx = tvm.transform.PassContext( - instruments=[ - PassFine("PassFine_0"), - PassBadEnterCtx("PassBadEnterCtx"), - PassFine("PassFine_1"), - ] -) -try: - with demo_ctx: - relay_mod = relay.transform.InferType()(relay_mod) -except ValueError as ex: - print("Catching", str(ex).split("\n")[-1]) - -############################################################################### -# Exceptions in ``PassInstrument`` instances cause all instruments of the current ``PassContext`` -# to be cleared, so nothing is printed when ``override_instruments`` is called. -demo_ctx.override_instruments([]) # no PassFine_0 exit_pass_ctx printed....etc - -############################################################################### -# If an exception occurs in ``exit_pass_ctx``, then the pass instrument is disabled. -# Then exception is propagated. That means ``PassInstrument`` instances registered -# after the one throwing the exception do not execute ``exit_pass_ctx``. -demo_ctx = tvm.transform.PassContext( - instruments=[ - PassFine("PassFine_0"), - PassBadExitCtx("PassBadExitCtx"), - PassFine("PassFine_1"), - ] -) -try: - # PassFine_1 execute enter_pass_ctx, but not exit_pass_ctx. - with demo_ctx: - relay_mod = relay.transform.InferType()(relay_mod) -except ValueError as ex: - print("Catching", str(ex).split("\n")[-1]) - -############################################################################### -# Exceptions occurred in ``should_run``, ``run_before_pass``, ``run_after_pass`` -# are not handled explicitly -- we rely on the context manager (the ``with`` syntax) -# to exit ``PassContext`` safely. -# -# We use ``run_before_pass`` as an example: -@pass_instrument -class PassBadRunBefore(PassExampleBase): - def run_before_pass(self, mod, pass_info): - print(self._name, "bad run_before_pass!!!") - raise ValueError("{} bad run_before_pass".format(self._name)) - - -demo_ctx = tvm.transform.PassContext( - instruments=[ - PassFine("PassFine_0"), - PassBadRunBefore("PassBadRunBefore"), - PassFine("PassFine_1"), - ] -) -try: - # All exit_pass_ctx are called. - with demo_ctx: - relay_mod = relay.transform.InferType()(relay_mod) -except ValueError as ex: - print("Catching", str(ex).split("\n")[-1]) - -############################################################################### -# Also note that pass instrumentation is not disable. So if we call -# ``override_instruments``, the ``exit_pass_ctx`` of old registered ``PassInstrument`` -# is called. -demo_ctx.override_instruments([]) - -############################################################################### -# If we don't wrap pass execution with ``with`` syntax, ``exit_pass_ctx`` is not -# called. Let try this with current ``PassContext``: -cur_pass_ctx = tvm.transform.PassContext.current() -cur_pass_ctx.override_instruments( - [ - PassFine("PassFine_0"), - PassBadRunBefore("PassBadRunBefore"), - PassFine("PassFine_1"), - ] -) - -############################################################################### -# Then call passes. ``exit_pass_ctx`` is not executed after the exception, -# as expectation. -try: - # No ``exit_pass_ctx`` got executed. - relay_mod = relay.transform.InferType()(relay_mod) -except ValueError as ex: - print("Catching", str(ex).split("\n")[-1]) - -############################################################################### -# Clear instruments. 
-cur_pass_ctx.override_instruments([]) diff --git a/gallery/how_to/optimize_operators/README.txt b/gallery/how_to/optimize_operators/README.txt deleted file mode 100644 index 889974c7873c..000000000000 --- a/gallery/how_to/optimize_operators/README.txt +++ /dev/null @@ -1,5 +0,0 @@ -Optimize Tensor Operators -------------------------- - -These how-tos demonstrate how to optimize a variety of tensor operations for a -variety of targets. diff --git a/gallery/how_to/optimize_operators/opt_conv_cuda.py b/gallery/how_to/optimize_operators/opt_conv_cuda.py deleted file mode 100644 index 1ab38450f5c4..000000000000 --- a/gallery/how_to/optimize_operators/opt_conv_cuda.py +++ /dev/null @@ -1,252 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _opt-conv-gpu: - -How to optimize convolution on GPU -================================== -**Author**: `Haichen Shen `_ - -In this tutorial, we will demonstrate how to write a high performance -convolution implementation in TVM. We use square size input tensors and filters -as an example, and assume the input to convolution has a large batch. In this -example, we use a different layout to store the data in order to achieve better -data locality. The buffer layout is HWCN, which stands for height, width, -channel, batch. - -""" - -################################################################ -# Preparation and Algorithm -# ------------------------- -# -# We use the fixed size for input tensors with 256 channels and 14 x 14 -# dimensions. The batch size is 256. Convolution filters contain 512 filters -# of size 3 x 3. We use stride size 1 and padding size 1 for the -# convolution. The following code defines the convolution algorithm in TVM. 
-# - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import numpy as np -import tvm -from tvm import te - -# The sizes of inputs and filters -batch = 256 -in_channel = 256 -out_channel = 512 -in_size = 14 -kernel = 3 -pad = 1 -stride = 1 - -# Algorithm -A = te.placeholder((in_size, in_size, in_channel, batch), name="A") -W = te.placeholder((kernel, kernel, in_channel, out_channel), name="W") -out_size = (in_size - kernel + 2 * pad) // stride + 1 -# Pad input -Apad = te.compute( - (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch), - lambda yy, xx, cc, nn: tvm.tir.if_then_else( - tvm.tir.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad < in_size), - A[yy - pad, xx - pad, cc, nn], - tvm.tir.const(0.0, "float32"), - ), - name="Apad", -) -# Create reduction variables -rc = te.reduce_axis((0, in_channel), name="rc") -ry = te.reduce_axis((0, kernel), name="ry") -rx = te.reduce_axis((0, kernel), name="rx") -# Compute the convolution -B = te.compute( - (out_size, out_size, out_channel, batch), - lambda yy, xx, ff, nn: te.sum( - Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff], axis=[ry, rx, rc] - ), - name="B", -) - - -############################################################################### -# Memory Hierarchy -# ---------------- -# -# We first specify the memory hierarchy for buffers. The figure below shows the -# GPU memory hierarchy. One important difference from CPU memory hierarchy is -# that GPU provides a cache buffer called shared memory, which is managed by -# programmers. Thus how to maximize the data reuse in the shared memory is -# critical to achieve high performance in GPU kernels. -# -# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/gpu_memory_hierarchy.png -# :align: center -# :height: 319px -# :width: 271px -# -# In this example, we load both Apad and W into buffer AA and WW, which are -# stored in the shared memory. These buffers will be later shared by all -# threads within the same thread block to compute the convolution. Each thread -# then loads its own part from shared buffer into their local registers, AL and -# WL. BL is a local cache of output B, which is also stored in the thread local -# registers. -# - -# Designate the memory hierarchy -s = te.create_schedule(B.op) -s[Apad].compute_inline() # compute Apad inline -AA = s.cache_read(Apad, "shared", [B]) -WW = s.cache_read(W, "shared", [B]) -AL = s.cache_read(AA, "local", [B]) -WL = s.cache_read(WW, "local", [B]) -BL = s.cache_write(B, "local") - -############################################################################### -# Blocking -# -------- -# -# The following code splits the workload into thread blocks and individual -# threads. We follow the blocking scheme in the matrix multiply. As shown in the -# figure below, given a pixel coordinate (y, x), a thread block is responsible -# for computing a region of block_factor x block_factor (64 x 64) for output -# channels and batch. Due to the limit of shared memory space, we only load step -# x block_factor (8 x 64) data from Apad and B each time to buffers in the -# shared memory. -# -# .. 
image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/conv_gpu_blocking.png -# :align: center -# :height: 308px -# :width: 317px -# - -# tile consts -tile = 8 -num_thread = 8 -block_factor = tile * num_thread -step = 8 -vthread = 2 - -# Get the GPU thread indices -block_x = te.thread_axis("blockIdx.x") -block_y = te.thread_axis("blockIdx.y") -block_z = te.thread_axis("blockIdx.z") -thread_x = te.thread_axis((0, num_thread), "threadIdx.x") -thread_y = te.thread_axis((0, num_thread), "threadIdx.y") -thread_xz = te.thread_axis((0, vthread), "vthread", name="vx") -thread_yz = te.thread_axis((0, vthread), "vthread", name="vy") - -# Split the workloads -hi, wi, fi, ni = s[B].op.axis -bz = s[B].fuse(hi, wi) -by, fi = s[B].split(fi, factor=block_factor) -bx, ni = s[B].split(ni, factor=block_factor) - -# Bind the iteration variables to GPU thread indices -s[B].bind(bz, block_z) -s[B].bind(by, block_y) -s[B].bind(bx, block_x) - -############################################################################### -# Virtual Thread Split -# -------------------- -# -# We further split the workload from a thread block to individual threads. To -# avoid *memory bank conflict*, we use virtual thread to split the area into 4 -# parts, and then tile into 8x8 grids. Therefore, shown in the figure below, -# each thread computes 4 strided grids, where size of each grid is 4 x 4. -# -# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/conv_gpu_vthread.png -# :align: center -# :height: 188px -# :width: 268px -# - -tyz, fi = s[B].split(fi, nparts=vthread) # virtual thread split -txz, ni = s[B].split(ni, nparts=vthread) # virtual thread split -ty, fi = s[B].split(fi, nparts=num_thread) -tx, ni = s[B].split(ni, nparts=num_thread) -s[B].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni) - -s[B].bind(tyz, thread_yz) -s[B].bind(txz, thread_xz) -s[B].bind(ty, thread_y) -s[B].bind(tx, thread_x) - -############################################################################### -# Cooperative Fetching -# -------------------- -# -# As mentioned before, each time step we need to transfer step x block_factor -# data from GPU global memory to shared memory. In order to reduce the memory -# transfer per thread, the following code lets threads in the same thread block -# coopertively fetch dependent data from global memory. 
-# - - -# Schedule BL local write -s[BL].compute_at(s[B], tx) -yi, xi, fi, ni = s[BL].op.axis -ry, rx, rc = s[BL].op.reduce_axis -rco, rci = s[BL].split(rc, factor=step) -s[BL].reorder(rco, ry, rx, rci, fi, ni) - -# Attach computation to iteration variables -s[AA].compute_at(s[BL], rx) -s[WW].compute_at(s[BL], rx) -s[AL].compute_at(s[BL], rci) -s[WL].compute_at(s[BL], rci) - -# Schedule for A's shared memory load -yi, xi, ci, ni = s[AA].op.axis -ty, ci = s[AA].split(ci, nparts=num_thread) -tx, ni = s[AA].split(ni, nparts=num_thread) -_, ni = s[AA].split(ni, factor=4) -s[AA].reorder(ty, tx, yi, xi, ci, ni) -s[AA].bind(ty, thread_y) -s[AA].bind(tx, thread_x) -s[AA].vectorize(ni) # vectorize memory load - -# Schedule for W's shared memory load -yi, xi, ci, fi = s[WW].op.axis -ty, ci = s[WW].split(ci, nparts=num_thread) -tx, fi = s[WW].split(fi, nparts=num_thread) -_, fi = s[WW].split(fi, factor=4) -s[WW].reorder(ty, tx, yi, xi, ci, fi) -s[WW].bind(ty, thread_y) -s[WW].bind(tx, thread_x) -s[WW].vectorize(fi) # vectorize memory load - - -############################################################################### -# Generate CUDA Kernel -# -------------------- -# -# Finally we use TVM to generate and compile the CUDA kernel, and evaluate the -# latency of convolution. -# - -func = tvm.build(s, [A, W, B], "cuda") -dev = tvm.cuda(0) -a_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype) -w_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype) -a = tvm.nd.array(a_np, dev) -w = tvm.nd.array(w_np, dev) -b = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), dev) -func(a, w, b) -evaluator = func.time_evaluator(func.entry_name, dev, number=1) -print("Convolution: %f ms" % (evaluator(a, w, b).mean * 1e3)) diff --git a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py b/gallery/how_to/optimize_operators/opt_conv_tensorcore.py deleted file mode 100644 index b43fac913956..000000000000 --- a/gallery/how_to/optimize_operators/opt_conv_tensorcore.py +++ /dev/null @@ -1,414 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _opt-conv-tensorcore: - -How to optimize convolution using TensorCores -============================================= -**Author**: `Siyuan Feng `_ - -In this tutorial, we will demonstrate how to write a high performance convolution -schedule using TensorCores in TVM. In this example, we assume the input to -convolution has a large batch. We strongly recommend covering the :ref:`opt-conv-gpu` tutorial first. 
- -""" - -################################################################ -# TensorCore Introduction -# ----------------------- -# Each Tensor Core provides a 4x4x4 matrix processing array that operates -# :code:`D = A * B + C`, where A, B, C and D are 4x4 matrices as Figure shows. -# The matrix multiplication inputs A and B are FP16 matrices, while the accumulation -# matrices C and D may be FP16 or FP32 matrices. -# -# However, CUDA programmers can only use warp-level primitive -# :code:`wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag)` to perform -# 16x16x16 half-precision matrix multiplication on tensor cores. Before invoking -# the matrix multiplication, programmers must load data from memory into registers -# with primitive :code:`wmma::load_matrix_sync`, explicitly. The NVCC compiler translates -# that primitive into multiple memory load instructions. At run time, every thread loads -# 16 elements from matrix A and 16 elements from B. - -################################################################ -# Preparation and Algorithm -# ------------------------- -# We use the fixed size for input tensors with 256 channels and 14 x 14 dimensions. -# The batch size is 256. Convolution filters contain 512 filters of size 3 x 3. -# We use stride size 1 and padding size 1 for the convolution. In the example, we use -# NHWCnc memory layout.The following code defines the convolution algorithm in TVM. - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import tvm -from tvm import te -import numpy as np -from tvm.contrib import nvcc - -# The sizes of inputs and filters -batch_size = 256 -height = 14 -width = 14 -in_channels = 256 -out_channels = 512 -kernel_h = 3 -kernel_w = 3 -pad_h = 1 -pad_w = 1 -stride_h = 1 -stride_w = 1 - -# TensorCore shape -block_size = 16 - -assert batch_size % block_size == 0 -assert in_channels % block_size == 0 -assert out_channels % block_size == 0 - -# Input feature map: (N, H, W, IC, n, ic) -data_shape = ( - batch_size // block_size, - height, - width, - in_channels // block_size, - block_size, - block_size, -) -# Kernel: (H, W, IC, OC, ic, oc) -kernel_shape = ( - kernel_h, - kernel_w, - in_channels // block_size, - out_channels // block_size, - block_size, - block_size, -) -# Output feature map: (N, H, W, OC, n, oc) -output_shape = ( - batch_size // block_size, - height, - width, - out_channels // block_size, - block_size, - block_size, -) - -# Reduction axes -kh = te.reduce_axis((0, kernel_h), name="kh") -kw = te.reduce_axis((0, kernel_w), name="kw") -ic = te.reduce_axis((0, in_channels // block_size), name="ic") -ii = te.reduce_axis((0, block_size), name="ii") - -# Algorithm -A = te.placeholder(data_shape, name="A", dtype="float16") -W = te.placeholder(kernel_shape, name="W", dtype="float16") -Apad = te.compute( - ( - batch_size // block_size, - height + 2 * pad_h, - width + 2 * pad_w, - in_channels // block_size, - block_size, - block_size, - ), - lambda n, h, w, i, nn, ii: tvm.tir.if_then_else( - tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width), - A[n, h - pad_h, w - pad_w, i, nn, ii], - tvm.tir.const(0.0, "float16"), - ), - name="Apad", -) -Conv = te.compute( - output_shape, - lambda n, h, w, o, nn, oo: te.sum( - Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32") - * W[kh, kw, ic, o, ii, oo].astype("float32"), - axis=[ic, kh, kw, ii], - ), - name="Conv", -) - -s = te.create_schedule(Conv.op) -s[Apad].compute_inline() - 
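As a hedged aside (not part of the original tutorial, and assuming the natural packing convention implied by ``data_shape`` above), the blocked NHWCnc layout can be pictured as a plain numpy rearrangement of an ordinary NHWC tensor:

import numpy as np

def pack_nhwc_to_nhwcnc(x, block=16):
    # Split N and C into (outer, inner) factors and move the inner factors last,
    # giving the (N//block, H, W, C//block, block, block) layout used for A above.
    n, h, w, c = x.shape
    assert n % block == 0 and c % block == 0
    x = x.reshape(n // block, block, h, w, c // block, block)
    return x.transpose(0, 2, 3, 4, 1, 5)

x = np.random.rand(32, 14, 14, 32).astype("float16")
print(pack_nhwc_to_nhwcnc(x).shape)  # (2, 14, 14, 2, 16, 16)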
-############################################################################### -# Memory Scope -# ------------ -# In traditional GPU schedule, we have global, shared and local memory scope. -# To support TensorCores, we add another three special memory scope: :code:`wmma.matrix_a`, -# :code:`wmma.matrix_b` and :code:`wmma.accumulator`. On hardware, all fragments scope -# stores at the on-chip registers level, the same place with local memory. - -# Designate the memory hierarchy -AS = s.cache_read(Apad, "shared", [Conv]) -WS = s.cache_read(W, "shared", [Conv]) -AF = s.cache_read(AS, "wmma.matrix_a", [Conv]) -WF = s.cache_read(WS, "wmma.matrix_b", [Conv]) -ConvF = s.cache_write(Conv, "wmma.accumulator") - -############################################################################### -# Define Tensor Intrinsic -# ----------------------- -# In fact, TensorCore is a special hardware operation. So, we can just use tensorize -# to replace a unit of computation with the TensorCore instruction. The first thing is -# that we need to define tensor intrinsic. -# -# There are four basic operation in TensorCore: :code:`fill_fragment`, :code:`load_matrix`, -# :code:`mma_sync` and :code:`store_matrix`. Since :code:`fill_fragment` and :code:`mma_sync` -# are both used in matrix multiplication, so we can just write following three intrinsics. - - -def intrin_wmma_load_matrix(scope): - n = 16 - A = te.placeholder((n, n), name="A", dtype="float16") - BA = tvm.tir.decl_buffer(A.shape, A.dtype, scope="shared", data_alignment=32, offset_factor=256) - C = te.compute((n, n), lambda i, j: A[i, j], name="C") - BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope=scope, data_alignment=32, offset_factor=256) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - - BA = ins[0] - BC = outs[0] - ib.emit( - tvm.tir.call_intrin( - "handle", - "tir.tvm_load_matrix_sync", - BC.data, - n, - n, - n, - BC.elem_offset // 256, - BA.access_ptr("r"), - n, - "row_major", - ) - ) - return ib.get() - - return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) - - -def intrin_wmma_gemm(): - n = 16 - A = te.placeholder((n, n), name="A", dtype="float16") - B = te.placeholder((n, n), name="B", dtype="float16") - k = te.reduce_axis((0, n), name="k") - C = te.compute( - (n, n), - lambda ii, jj: te.sum(A[ii, k].astype("float") * B[k, jj].astype("float"), axis=k), - name="C", - ) - BA = tvm.tir.decl_buffer( - A.shape, A.dtype, name="BA", scope="wmma.matrix_a", data_alignment=32, offset_factor=256 - ) - BB = tvm.tir.decl_buffer( - B.shape, B.dtype, name="BB", scope="wmma.matrix_b", data_alignment=32, offset_factor=256 - ) - BC = tvm.tir.decl_buffer( - C.shape, C.dtype, name="BC", scope="wmma.accumulator", data_alignment=32, offset_factor=256 - ) - - def intrin_func(ins, outs): - BA, BB = ins - (BC,) = outs - - def init(): - ib = tvm.tir.ir_builder.create() - ib.emit( - tvm.tir.call_intrin( - "handle", "tir.tvm_fill_fragment", BC.data, n, n, n, BC.elem_offset // 256, 0.0 - ) - ) - return ib.get() - - def update(): - ib = tvm.tir.ir_builder.create() - ib.emit( - tvm.tir.call_intrin( - "handle", - "tir.tvm_mma_sync", - BC.data, - BC.elem_offset // 256, - BA.data, - BA.elem_offset // 256, - BB.data, - BB.elem_offset // 256, - BC.data, - BC.elem_offset // 256, - ) - ) - return ib.get() - - return update(), init(), update() - - return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC}) - - -def intrin_wmma_store_matrix(): - n = 16 - A = te.placeholder((n, n), name="A", dtype="float32") - BA = 
tvm.tir.decl_buffer( - A.shape, A.dtype, scope="wmma.accumulator", data_alignment=32, offset_factor=256 - ) - C = te.compute((n, n), lambda i, j: A[i, j], name="C") - BC = tvm.tir.decl_buffer(C.shape, C.dtype, scope="global", data_alignment=32, offset_factor=256) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - BA = ins[0] - BC = outs[0] - ib.emit( - tvm.tir.call_intrin( - "handle", - "tir.tvm_store_matrix_sync", - BA.data, - n, - n, - n, - BA.elem_offset // 256, - BC.access_ptr("w"), - n, - "row_major", - ) - ) - return ib.get() - - return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, C: BC}) - - -############################################################################### -# Scheduling the Computation -# -------------------------- -# To use TensorCores in TVM, we must schedule the computation into specific structure -# to match the tensor intrinsic. The same as traditional GPU programs, we can also use -# shared memory to boost the speed. If you have any questions about blocking and shared -# memory, please refer :ref:`opt-conv-gpu`. -# -# In this example, each block contains 2x4 warps, and each warp calls 4x2 TensorCore -# instructions. Thus, the output shape of each warp is 64x32 and each block outputs -# 128x128 titles. Due to the limit of shared memory space, we only load 2 blocks (2x128x128 tiles) -# one time. -# -# .. note:: -# -# *Warp-level Operation* -# -# Note that all TensorCore instructions are warp-level instructions, which means all 32 threads -# in a warp should do this instruction simultaneously. Making threadIdx.x extent=32 is one of the -# easiest way to solve this. Then We can bind threadIdx.x to any loops except those contain -# TensorCore intrinsics directly or indirectly. Also note that it is not the unique solution. -# The only thing we should do is to make sure all threads in a warp can call TensorCore at the same time. 
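As a quick arithmetic check (an illustrative aside, using the tiling constants defined in the next code block), the 128x128 output tile per block follows directly from the 16x16 TensorCore tile, the per-warp tile counts, and the warps per block:

wmma_tile = 16
warp_extent_n = 2 * wmma_tile       # warp_row_tiles = 2  -> 32 along the batch axis
warp_extent_o = 4 * wmma_tile       # warp_col_tiles = 4  -> 64 along the output-channel axis
block_extent_n = 4 * warp_extent_n  # block_row_warps = 4 -> 128
block_extent_o = 2 * warp_extent_o  # block_col_warps = 2 -> 128
print(block_extent_n, block_extent_o)  # 128 128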
-
-# Define tiling sizes
-block_row_warps = 4
-block_col_warps = 2
-warp_row_tiles = 2
-warp_col_tiles = 4
-warp_size = 32
-chunk = 2
-
-block_x = te.thread_axis("blockIdx.x")
-block_y = te.thread_axis("blockIdx.y")
-block_z = te.thread_axis("blockIdx.z")
-thread_x = te.thread_axis("threadIdx.x")
-thread_y = te.thread_axis("threadIdx.y")
-thread_z = te.thread_axis("threadIdx.z")
-
-nc, hc, wc, oc, nnc, ooc = Conv.op.axis
-block_k = s[Conv].fuse(hc, wc)
-s[Conv].bind(block_k, block_z)
-nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
-block_i, nc = s[Conv].split(nc, factor=block_row_warps)
-oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
-block_j, oc = s[Conv].split(oc, factor=block_col_warps)
-s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
-s[Conv].bind(block_i, block_x)
-s[Conv].bind(block_j, block_y)
-s[Conv].bind(nc, thread_y)
-s[Conv].bind(oc, thread_z)
-
-# Schedule local computation
-s[ConvF].compute_at(s[Conv], oc)
-n, h, w, o, nnf, oof = ConvF.op.axis
-ko, ki = s[ConvF].split(ic, factor=chunk)
-s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)
-
-# Move intermediate computation into each output compute tile
-s[AF].compute_at(s[ConvF], kw)
-s[WF].compute_at(s[ConvF], kw)
-
-# Schedule for A's shared memory
-s[AS].compute_at(s[ConvF], kh)
-n, h, w, i, nn, ii = AS.op.axis
-tx, xo = s[AS].split(n, nparts=block_row_warps)
-ty, yo = s[AS].split(xo, nparts=block_col_warps)
-t = s[AS].fuse(nn, ii)
-to, ti = s[AS].split(t, factor=warp_size)
-s[AS].bind(tx, thread_y)
-s[AS].bind(ty, thread_z)
-s[AS].bind(ti, thread_x)
-
-# Schedule for W's shared memory
-s[WS].compute_at(s[ConvF], kh)
-kh, kw, ic, o, ii, oo = WS.op.axis
-tx, xo = s[WS].split(o, nparts=block_row_warps)
-ty, yo = s[WS].split(xo, nparts=block_col_warps)
-t = s[WS].fuse(ii, oo)
-to, ti = s[WS].split(t, nparts=warp_size)
-s[WS].bind(tx, thread_y)
-s[WS].bind(ty, thread_z)
-s[WS].bind(to, thread_x)
-s[WS].vectorize(ti)
-print(tvm.lower(s, [A, W, Conv], simple_mode=True))
-
-###############################################################################
-# Lowering Computation to Intrinsics
-# ----------------------------------
-# The last phase is to lower the computation loops down to TensorCore hardware intrinsics
-# by mapping the 2D convolution to the tensor intrinsics.
-
-s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix("wmma.matrix_a"))
-s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix("wmma.matrix_b"))
-s[Conv].tensorize(nnc, intrin_wmma_store_matrix())
-s[ConvF].tensorize(nnf, intrin_wmma_gemm())
-print(tvm.lower(s, [A, W, Conv], simple_mode=True))
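###############################################################################
# Editor's sketch (not in the original tutorial): a quick way to confirm that the
# tensorize calls above took effect is to search the printed TIR for the WMMA
# intrinsic names. This only inspects the lowered module and does not require a
# TensorCore-capable GPU.

lowered_ir = str(tvm.lower(s, [A, W, Conv], simple_mode=True))
for intrin_name in (
    "tvm_fill_fragment",
    "tvm_load_matrix_sync",
    "tvm_mma_sync",
    "tvm_store_matrix_sync",
):
    assert intrin_name in lowered_ir, "expected %s in the lowered IR" % intrin_name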
-
-###############################################################################
-# Generate CUDA Kernel
-# --------------------
-# Finally we use TVM to generate and compile the CUDA kernel, and evaluate the latency of the
-# convolution. Since TensorCores are only supported on NVIDIA GPUs with Compute Capability 7.0
-# or higher, the kernel may not be able to run on our build server.
-
-dev = tvm.cuda(0)
-if nvcc.have_tensorcore(dev.compute_version):
-    with tvm.transform.PassContext(config={"tir.UnrollLoop": {"auto_max_step": 16}}):
-        func = tvm.build(s, [A, W, Conv], "cuda")
-        a_np = np.random.uniform(size=data_shape).astype(A.dtype)
-        w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
-        a = tvm.nd.array(a_np, dev)
-        w = tvm.nd.array(w_np, dev)
-        c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev)
-        evaluator = func.time_evaluator(func.entry_name, dev, number=10)
-        print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))
-
-###############################################################################
-# Summary
-# -------
-# This tutorial demonstrates how TVM scheduling primitives can be used to
-# call TensorCores on specific GPUs.
diff --git a/gallery/how_to/optimize_operators/opt_gemm.py b/gallery/how_to/optimize_operators/opt_gemm.py
deleted file mode 100644
index 7ca423281570..000000000000
--- a/gallery/how_to/optimize_operators/opt_gemm.py
+++ /dev/null
@@ -1,394 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-.. _opt-gemm:
-
-How to optimize GEMM on CPU
-===========================
-**Author**: `Jian Weng `_, \
-            `Ruofei Yu `_
-
-(TL;DR) TVM provides abstract interfaces which allow users to describe an algorithm and the
-algorithm's implementing organization (the so-called schedule) separately. Typically, writing
-an algorithm in a high-performance schedule breaks the algorithm's readability and modularity.
-Also, trying various seemingly promising schedules is time-consuming. With the help of TVM, we
-can try these schedules efficiently and enhance the performance.
-
-In this tutorial, we will demonstrate how to use TVM to optimize square matrix multiplication
-and achieve a 200x speedup over the baseline by simply adding 18 extra lines of code.
-
-There are two important optimizations for compute-intensive applications executed on CPU:
-    1. Increase the cache hit rate of memory accesses. Both complex numerical computation and
-       hot-spot memory access can be accelerated by a high cache hit rate. This requires us to
-       transform the original memory access pattern into a pattern that fits the cache policy.
-    2. SIMD (Single Instruction Multiple Data), also known as the vector processing unit. Each
-       time, a small batch of data, rather than a single element, is processed. This requires us
-       to make the data access pattern in the loop body uniform so that the LLVM backend can
-       lower it to SIMD.
-
-Actually, all the methodologies used in this tutorial are a subset of the tricks mentioned in this
-`repo `_. Some of them have been applied by the TVM abstraction automatically, but some of them
-cannot be simply applied due to TVM constraints.
-
-All the experiment results mentioned below were obtained on a 2015 15-inch MacBook equipped with an
-Intel i7-4770HQ CPU. The cache line size should be 64 bytes for all x86 CPUs.
-"""
-
-
-################################################################################################
-# Preparation and Baseline
-# ------------------------
-# In this tutorial, we will demo how to use TVM to optimize matrix multiplication.
-# Before actually demonstrating, we first define these variables.
-# Then we write a baseline implementation, the simplest way to write a matrix multiplication in TVM.
-
-import tvm
-import tvm.testing
-from tvm import te
-import numpy
-import timeit
-
-# The size of the matrix
-# (M, K) x (K, N)
-# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL.
-M = 1024
-K = 1024
-N = 1024
-
-# The default tensor type in tvm
-dtype = "float32"
-
-# Using Intel AVX2 (Advanced Vector Extensions) ISA for SIMD
-# To get the best performance, please change the following line
-# to llvm -mcpu=core-avx2, or the specific type of CPU you use
-target = "llvm"
-dev = tvm.device(target, 0)
-
-# Randomly generated tensors for testing
-a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), dev)
-b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), dev)
-
-np_repeat = 100
-np_runing_time = timeit.timeit(
-    setup="import numpy\n"
-    "M = " + str(M) + "\n"
-    "K = " + str(K) + "\n"
-    "N = " + str(N) + "\n"
-    'dtype = "float32"\n'
-    "a = numpy.random.rand(M, K).astype(dtype)\n"
-    "b = numpy.random.rand(K, N).astype(dtype)\n",
-    stmt="answer = numpy.dot(a, b)",
-    number=np_repeat,
-)
-print("Numpy running time: %f" % (np_runing_time / np_repeat))
-
-answer = numpy.dot(a.numpy(), b.numpy())
-
-# Algorithm
-k = te.reduce_axis((0, K), "k")
-A = te.placeholder((M, K), name="A")
-B = te.placeholder((K, N), name="B")
-C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")
-
-# Default schedule
-s = te.create_schedule(C.op)
-func = tvm.build(s, [A, B, C], target=target, name="mmult")
-assert func
-
-c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-func(a, b, c)
-tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-evaluator = func.time_evaluator(func.entry_name, dev, number=1)
-print("Baseline: %f" % evaluator(a, b, c).mean)
-
-################################################################################################
-# In TVM, we can always inspect the lower-level IR to debug or optimize our schedule.
-# Here is the generated IR using our baseline schedule.
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-################################################################################################
-# Blocking
-# --------
-# An important trick to enhance the cache hit rate is blocking --- data chunks are computed
-# block by block. The memory access inside a block is a small neighbourhood with high memory
-# locality. In this tutorial, we pick 32 as the blocking factor.
-# So each block fills 32 * 32 * sizeof(float), which is 4KB, of the L1 data cache whose total size is 32KB.
-
-bn = 32
-kfactor = 4
-s = te.create_schedule(C.op)
-
-# Blocking by loop tiling
-mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(kaxis,) = s[C].op.reduce_axis
-ko, ki = s[C].split(kaxis, factor=kfactor)
-
-# Hoist reduction domain outside the blocking loop
-s[C].reorder(mo, no, ko, ki, mi, ni)
-
-func = tvm.build(s, [A, B, C], target=target, name="mmult")
-assert func
-
-c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-func(a, b, c)
-tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-# By simply tiling the loop 32x32, and hoisting ko, ki outside the blocking loops,
-# we can see a big speedup compared with the baseline.
-evaluator = func.time_evaluator(func.entry_name, dev, number=10)
-print("Opt1: %f" % evaluator(a, b, c).mean)
-
-################################################################################################
-# Here is the generated IR after blocking.
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-###################################################################################################
-# Vectorization
-# -------------
-# Another important trick is vectorization. When the memory access pattern is uniform,
-# the compiler can detect this pattern and pass the contiguous memory to the vector processor.
-# In TVM, we can use the `vectorize` interface to hint this pattern to the compiler, so that it
-# can be accelerated vastly.
-#
-# In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly.
-
-s = te.create_schedule(C.op)
-mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(kaxis,) = s[C].op.reduce_axis
-ko, ki = s[C].split(kaxis, factor=kfactor)
-
-s[C].reorder(mo, no, ko, ki, mi, ni)
-
-# Vectorization
-s[C].vectorize(ni)
-
-func = tvm.build(s, [A, B, C], target=target, name="mmult")
-assert func
-
-c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-func(a, b, c)
-tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-evaluator = func.time_evaluator(func.entry_name, dev, number=10)
-print("Opt2: %f" % evaluator(a, b, c).mean)
-
-################################################################################################
-# Here is the generated IR after vectorization.
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-###################################################################################################
-# Loop Permutation
-# ----------------
-# If we look at the above IR, we can see that the inner loop row data is vectorized for both B and C.
-# Next we will look at the access pattern of A. In the current schedule, A is accessed column by
-# column, which is not cache friendly. If we change the nested loop order of ki and the inner axis mi,
-# the access pattern for the A matrix becomes more cache friendly.
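################################################################################################
# Editor's sketch (not part of the original tutorial): before applying the permuted schedule,
# the blocked (mo, no, ko, mi, ki, ni) loop nest can be mirrored in plain NumPy to make the
# tiling concrete. The helper name `blocked_matmul_reference` is purely illustrative, and the
# shapes are kept small so the Python loops stay fast.

import numpy


def blocked_matmul_reference(a_mat, b_mat, bn_tile=32, k_tile=4):
    m_dim, k_dim = a_mat.shape
    _, n_dim = b_mat.shape
    out = numpy.zeros((m_dim, n_dim), dtype=a_mat.dtype)
    for mo_i in range(0, m_dim, bn_tile):          # outer row blocks
        for no_i in range(0, n_dim, bn_tile):      # outer column blocks
            for ko_i in range(0, k_dim, k_tile):   # hoisted reduction blocks
                # The inner mi/ki/ni loops are expressed as a small tile matmul;
                # NumPy's contiguous slicing plays the role of the vectorized ni axis.
                out[mo_i : mo_i + bn_tile, no_i : no_i + bn_tile] += (
                    a_mat[mo_i : mo_i + bn_tile, ko_i : ko_i + k_tile]
                    @ b_mat[ko_i : ko_i + k_tile, no_i : no_i + bn_tile]
                )
    return out


a_small = numpy.random.rand(128, 128).astype("float32")
b_small = numpy.random.rand(128, 128).astype("float32")
numpy.testing.assert_allclose(
    blocked_matmul_reference(a_small, b_small), a_small @ b_small, rtol=1e-3
)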
-
-s = te.create_schedule(C.op)
-mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(kaxis,) = s[C].op.reduce_axis
-ko, ki = s[C].split(kaxis, factor=kfactor)
-
-# re-ordering
-s[C].reorder(mo, no, ko, mi, ki, ni)
-s[C].vectorize(ni)
-
-func = tvm.build(s, [A, B, C], target=target, name="mmult")
-assert func
-
-c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-func(a, b, c)
-tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-evaluator = func.time_evaluator(func.entry_name, dev, number=10)
-print("Opt3: %f" % evaluator(a, b, c).mean)
-
-################################################################################################
-# Here is the generated IR after loop permutation.
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-###################################################################################################
-# Array Packing
-# -------------
-# Another important trick is array packing. This trick reorders the storage of a multi-dimensional
-# array so that it is accessed sequentially after it is flattened and stored in one-dimensional
-# memory.
-#
-# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png
-#    :align: center
-#
-# NOTE: This figure is a general illustration of how array packing works.
-
-
-###################################################################################################
-# We can use array packing to address the access pattern for B. Observe the array access pattern of
-# B after flattening, which is not sequential as we iterate over the K dimension. We can reorder B
-# with dimensions [K][N] so that it has dimensions [N/bn][K][bn], where bn is the blocking factor and
-# also the vector size for B in the inner loop. This reorder splits N into two dimensions ---
-# bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B
-# from outer to inner loops (no, ko, ki, ni), resulting in a sequential access pattern for B after
-# flattening.
-
-
-# We have to re-write the algorithm slightly.
-packedB = te.compute(
-    (N / bn, K, bn), lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB"
-)
-C = te.compute(
-    (M, N),
-    lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k),
-    name="C",
-)
-
-s = te.create_schedule(C.op)
-
-mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(kaxis,) = s[C].op.reduce_axis
-ko, ki = s[C].split(kaxis, factor=kfactor)
-
-s[C].reorder(mo, no, ko, mi, ki, ni)
-s[C].vectorize(ni)
-
-bigN, _, littleN = s[packedB].op.axis
-s[packedB].vectorize(littleN)
-s[packedB].parallel(bigN)
-
-func = tvm.build(s, [A, B, C], target=target, name="mmult")
-assert func
-
-c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-func(a, b, c)
-tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-evaluator = func.time_evaluator(func.entry_name, dev, number=10)
-print("Opt4: %f" % evaluator(a, b, c).mean)
-
-################################################################################################
-# Here is the generated IR after array packing.
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
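################################################################################################
# Editor's sketch (not part of the original tutorial): the packing transform above can be
# checked in plain NumPy. B with shape (K, N) is rearranged to (N / bn, K, bn) so that the bn
# consecutive elements read by the innermost ni loop are contiguous in memory. The names below
# (`b_demo`, `packed_demo`, `bn_demo`) are purely illustrative.

import numpy

bn_demo = 4
b_demo = numpy.arange(8 * 8).reshape(8, 8)  # a small (K, N) = (8, 8) matrix
# packed_demo[bigN, k, littleN] == b_demo[k, bigN * bn_demo + littleN]
packed_demo = b_demo.reshape(8, 8 // bn_demo, bn_demo).transpose(1, 0, 2)
assert packed_demo[1, 3, 2] == b_demo[3, 1 * bn_demo + 2]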
-################################################################################################
-# Write cache for blocks
-# ----------------------
-# After blocking, the program writes its result to C block by block, so the access pattern
-# is not sequential. We can therefore use a sequential cache array to hold the block results and
-# write to C when all the block results are ready.
-#
-
-s = te.create_schedule(C.op)
-
-# Allocate write cache
-CC = s.cache_write(C, "global")
-
-mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-
-# Write cache is computed at no
-s[CC].compute_at(s[C], no)
-
-# New inner axes
-mc, nc = s[CC].op.axis
-
-(kaxis,) = s[CC].op.reduce_axis
-ko, ki = s[CC].split(kaxis, factor=kfactor)
-s[CC].reorder(ko, mc, ki, nc)
-s[CC].vectorize(nc)
-
-# TODO: Add separate optimization step to discuss loop unrolling
-# Unrolling is a loop optimization strategy which can reduce branch
-# prediction failures and increase the chance of concurrent execution.
-# Unroll the kfactor loops.
-s[CC].unroll(ki)
-
-bigN, _, littleN = s[packedB].op.axis
-s[packedB].vectorize(littleN)
-s[packedB].parallel(bigN)
-
-func = tvm.build(s, [A, B, C], target=target, name="mmult")
-assert func
-
-c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-func(a, b, c)
-tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-evaluator = func.time_evaluator(func.entry_name, dev, number=10)
-print("Opt5: %f" % evaluator(a, b, c).mean)
-
-################################################################################################
-# Here is the generated IR after write caching.
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-###################################################################################################
-# Parallel
-# --------
-# Furthermore, we can also utilize multi-core processors to perform thread-level parallelization.
-
-s = te.create_schedule(C.op)
-
-CC = s.cache_write(C, "global")
-
-mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-
-s[CC].compute_at(s[C], no)
-
-mc, nc = s[CC].op.axis
-
-(kaxis,) = s[CC].op.reduce_axis
-ko, ki = s[CC].split(kaxis, factor=kfactor)
-s[CC].reorder(ko, mc, ki, nc)
-s[CC].vectorize(nc)
-s[CC].unroll(ki)
-
-# parallel
-s[C].parallel(mo)
-
-bigN, _, littleN = s[packedB].op.axis
-s[packedB].vectorize(littleN)
-s[packedB].parallel(bigN)
-
-func = tvm.build(s, [A, B, C], target=target, name="mmult")
-assert func
-
-c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
-func(a, b, c)
-tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)
-
-evaluator = func.time_evaluator(func.entry_name, dev, number=50)
-opt6_time = evaluator(a, b, c).mean
-print("Opt6: %f" % opt6_time)
-
-################################################################################################
-# Here is the generated IR after parallelization.
-
-print(tvm.lower(s, [A, B, C], simple_mode=True))
-
-###################################################################################################
-
-##################################################################################################
-# Summary
-# -------
-# After applying the above simple optimizations with only 18 lines of code,
-# our generated code can achieve 60% of the `numpy` performance with MKL.
-# Note that the outputs on the web page reflect the running times on a non-exclusive
-# Docker container; therefore, they are *unreliable*. It is highly encouraged to run the
-# tutorial by yourself to observe the performance gain achieved by TVM.
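################################################################################################
# Editor's sketch (not part of the original tutorial): with the timings collected above, the
# per-run cost of the final parallel schedule can be compared directly against the NumPy/MKL
# baseline measured at the top. All variables used here are defined earlier in this tutorial.

print("numpy baseline per run:     %f s" % (np_runing_time / np_repeat))
print("final TVM schedule per run: %f s" % opt6_time)
print("ratio (numpy time / TVM time): %.2f" % ((np_runing_time / np_repeat) / opt6_time))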
diff --git a/gallery/how_to/tune_with_autoscheduler/README.txt b/gallery/how_to/tune_with_autoscheduler/README.txt deleted file mode 100644 index 636a636d81c0..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -Use AutoScheduler for Template-Free Scheduling ----------------------------------------------- - -The TVM AutoScheduler offers a template-free way to tune models. These how-tos -demonstrate how to tune a variety of different models to target a number of -common platforms. diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/conv2d.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/conv2d.json deleted file mode 100644 index c748920d14db..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/ci_logs/conv2d.json +++ /dev/null @@ -1,2 +0,0 @@ -# Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI. -{"i": [["[\"conv2d_layer\", 1, 7, 7, 512, 512, 3, 3, [1, 1], [1, 1]]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32"], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 512, [1, 64, 2, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 7, [1, 1, 7, 1], 1], ["SP", 3, 20, 512, [4, 2], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 48, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 504, [4], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000429498], 0, 1.59126, 1603259147], "v": "v0.2"} diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json deleted file mode 100644 index b0d33a911a63..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/ci_logs/matmul.json +++ /dev/null @@ -1,2 +0,0 @@ -# Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI. 
-{"i": [["[\"matmul_add\", 1024, 1024, 1024, \"float32\"]", "llvm -keys=cpu", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 2, 0, 1024, [2, 1, 4], 1], ["SP", 2, 4, 1024, [1, 1, 8], 1], ["SP", 2, 8, 1024, [4], 1], ["RE", 2, [0, 4, 1, 5, 8, 2, 6, 9, 3, 7]], ["FSP", 4, 0, 0, 2], ["FSP", 4, 3, 1, 2], ["RE", 4, [0, 3, 1, 4, 2, 5]], ["CA", 2, 4, 3], ["FU", 4, [0, 1]], ["AN", 4, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$8"], ["AN", 2, 9, 2], ["AN", 4, 4, 2]]]], "r": [[0.0044742], 0, 0.335558, 1607112214], "v": "v0.3"} diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json deleted file mode 100644 index 6f33ebef1a7b..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-18-NHWC-B1-cuda.json +++ /dev/null @@ -1,24 +0,0 @@ -{"i": [["[\"1097323f3970e5c881ad3a0028ca79cb\", [1, 14, 14, 256], [4, 4, 256, 256], [1, 1, 1, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [49], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 7, 1], 1], ["SP", 6, 15, 256, [1, 8, 1, 2], 1], ["SP", 6, 20, 256, [4, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [8], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[4.47561e-05], 0, 1.55357, 1691392766], "v": "v0.6"} -{"i": [["[\"2d10de6646307f0e3e5cf4b31c20e69b\", [1, 56, 56, 64], [1, 1, 64, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [2, 1, 1, 1], 1], ["SP", 3, 10, 56, [1, 2, 2, 1], 1], ["SP", 3, 15, 64, [2, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], 
["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 4, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 8, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[7.78139e-06], 0, 1.03735, 1691392799], "v": "v0.6"} -{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 56, 56, 64], [3, 3, 64, 128], [1, 1, 1, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 14, 1, 2], 1], ["SP", 3, 10, 28, [2, 2, 1, 1], 1], ["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [1, 1], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 64, [8, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 120, [2], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[6.99329e-05], 0, 1.30771, 1691392847], "v": "v0.6"} -{"i": [["[\"00a059b856ac30ac172b6252254479a6\", [1, 512], [1000, 512], [1, 1000], [1, 1000]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [2, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 16, [2], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 16, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.73674e-05], 0, 1.14621, 1691392870], "v": "v0.6"} -{"i": [["[\"8c53ca2904398da2889aa7508082d7bb\", [1, 7, 7, 512], [1, 1, 1, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [32], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [4], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$0"]]]], "r": [[3.91348e-06], 0, 0.95337, 
1691392900], "v": "v0.6"} -{"i": [["[\"d78e8eb6021c4cdda0ad7775d10f751a\", [1, 7, 7, 512], [4, 4, 512, 512], [1, 7, 7, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 2, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 4, 2], 1], ["SP", 6, 15, 512, [2, 32, 1, 1], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [8], 1], ["SP", 4, 4, 512, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[8.90576e-05], 0, 1.97489, 1691392966], "v": "v0.6"} -{"i": [["[\"0fad1b42d0d33418e0a8d15d3bbad3c9\", [1, 14, 14, 256], [1, 1, 256, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [2, 8, 4, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [16, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 52, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[9.47321e-06], 0, 1.20995, 1691393001], "v": "v0.6"} -{"i": [["[\"25577781e50c611c2e45e73c1cb3a6ca\", [1, 28, 28, 128], [4, 4, 128, 128], [1, 28, 28, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu 
-arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [28], 1], ["SP", 8, 4, 128, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 1, 1, 1], 1], ["SP", 6, 5, 4, [2, 1, 1, 1], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [1, 4, 2, 4], 1], ["SP", 6, 20, 128, [1, 16], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000120214], 0, 1.54604, 1691393074], "v": "v0.6"} -{"i": [["[\"40b1cf1fd37b0ef111b3cc0247302508\", [1, 7, 7, 512], [4, 4, 512, 512], [1, 1, 1, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [2, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 1, 2], 1], ["SP", 6, 15, 512, [1, 32, 1, 2], 1], ["SP", 6, 20, 512, [4, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 
3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000101111], 0, 2.149, 1691393145], "v": "v0.6"} -{"i": [["[\"f19692ed81d032b1697c08adee62f9a5\", [1, 28, 28, 128], [4, 4, 128, 128], [1, 28, 28, 128], [1, 1, 1, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [2, 7, 1, 7], 1], ["SP", 6, 15, 128, [2, 32, 2, 1], 1], ["SP", 6, 20, 128, [8, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[5.94487e-05], 0, 2.31076, 1691393218], "v": "v0.6"} -{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 224, 224, 3], [7, 7, 3, 64], [1, 1, 1, 64], [1, 112, 112, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 4, 2, 1], 1], ["SP", 3, 10, 112, [7, 1, 1, 1], 1], ["SP", 3, 15, 64, [1, 8, 2, 4], 1], ["SP", 3, 20, 7, [1, 1], 1], ["SP", 3, 23, 7, [1, 1], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 24, [1], 1], ["AN", 4, 1, 2], ["FFSP", 
4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 9, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$1024"]]]], "r": [[6.31052e-05], 0, 1.71021, 1691393271], "v": "v0.6"} -{"i": [["[\"0fad1b42d0d33418e0a8d15d3bbad3c9\", [1, 56, 56, 64], [1, 1, 64, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 14, 1], 1], ["SP", 3, 10, 28, [2, 2, 1, 1], 1], ["SP", 3, 15, 128, [2, 32, 1, 2], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 2, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 54, [3], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.69842e-05], 0, 1.2583, 1691393301], "v": "v0.6"} -{"i": [["[\"6d012ba18a086c11ee2b85c7324e16f2\", [1, 112, 112, 64], [1, 1, 1, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$512"]]]], "r": [[8.70322e-06], 0, 1.10916, 1691393368], "v": "v0.6"} -{"i": [["[\"0bcf718c0e6566bcd6c3b1437a3b6291\", [1, 28, 28, 128], [4, 4, 128, 128], [1, 1, 1, 128], [1, 28, 28, 128]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 128, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [2, 1, 1, 2], 1], ["SP", 6, 10, 196, [1, 7, 4, 1], 1], ["SP", 6, 15, 128, [1, 16, 2, 2], 1], ["SP", 6, 20, 128, [8, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 128, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 
5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000176353], 0, 2.03733, 1691393450], "v": "v0.6"} -{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 14, 14, 256], [3, 3, 256, 512], [1, 1, 1, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 7, 1, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [1, 8, 2, 2], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [3, 1], 1], ["SP", 3, 26, 256, [1, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 180, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[7.07803e-05], 0, 1.30485, 1691393484], "v": "v0.6"} -{"i": [["[\"7d79c516e212fe1d73f5dbb90eaca2cf\", [1, 1000], [1, 1000]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["SP", 4, 1, 1000, [40], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["AN", 4, 0, 5], ["AN", 1, 0, 6], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 3, 0, "auto_unroll_max_step$1024"]]]], "r": [[1.08259e-05], 0, 1.08282, 1691393496], "v": "v0.6"} -{"i": [["[\"64b7ce5264a64cb340d78b444b0325e6\", [1, 14, 14, 256], [4, 4, 256, 256], [1, 14, 14, 256], [1, 1, 1, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 4], 1], ["SP", 6, 5, 4, [1, 1, 2, 1], 1], ["SP", 6, 10, 49, [1, 1, 1, 7], 1], ["SP", 6, 15, 256, [1, 32, 1, 2], 1], ["SP", 6, 20, 256, [1, 16], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, 
"shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [1], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 448, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000135607], 0, 1.73571, 1691394110], "v": "v0.6"} -{"i": [["[\"10b8215aaf2e14d47d40b4093e6f41a0\", [1, 56, 56, 64], [6, 6, 64, 64], [1, 56, 56, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [28], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 2], 1], ["SP", 6, 5, 6, [1, 1, 2, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 1], 1], ["SP", 6, 15, 64, [1, 32, 2, 1], 1], ["SP", 6, 20, 64, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [49], 1], ["SP", 4, 4, 64, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 96, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 48, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[5.44152e-05], 0, 3.94918, 1691393659], "v": "v0.6"} -{"i": [["[\"0fad1b42d0d33418e0a8d15d3bbad3c9\", [1, 28, 28, 128], [1, 1, 128, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [2, 7, 1, 1], 1], ["SP", 3, 10, 14, [1, 2, 7, 1], 1], ["SP", 3, 15, 256, [1, 16, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 
1, [1, 1], 1], ["SP", 3, 26, 128, [8, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [4], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[1.89493e-05], 0, 1.10813, 1691393688], "v": "v0.6"} -{"i": [["[\"7f3fee61bc3c2604395f5d343b840b7c\", [1, 14, 14, 256], [4, 4, 256, 256], [1, 14, 14, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [49], 1], ["SP", 8, 4, 256, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 49, 1, 1], 1], ["SP", 6, 15, 256, [2, 1, 2, 4], 1], ["SP", 6, 20, 256, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [2], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[6.70662e-05], 0, 1.51013, 1691393757], "v": "v0.6"} -{"i": [["[\"07f9fcad27bdd3233f86fe35a5185d33\", [1, 28, 28, 128], [3, 3, 128, 256], [1, 1, 1, 256], [1, 14, 14, 256]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 7, 2, 1], 1], ["SP", 3, 10, 14, [1, 1, 2, 7], 1], ["SP", 3, 15, 256, [4, 16, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [2, 4], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 
14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 24, [3], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 696, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000216208], 0, 2.42581, 1691393795], "v": "v0.6"} -{"i": [["[\"6c4f6234946e16bcf9e48bdf289f9200\", [1, 56, 56, 64], [6, 6, 64, 64], [1, 56, 56, 64], [1, 1, 1, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [32], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 1], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 64, [2, 8, 2, 1], 1], ["SP", 6, 20, 64, [8, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 64, [1], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[4.28227e-05], 0, 3.02731, 1691393870], "v": "v0.6"} -{"i": [["[\"7c2a4f1f432f81c44985590780dfb52d\", [1, 56, 56, 64], [6, 6, 64, 64], [1, 1, 1, 64], [1, 56, 56, 64]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 6], 1], ["SP", 6, 10, 196, [2, 7, 1, 1], 1], ["SP", 6, 15, 64, [1, 16, 1, 1], 1], ["SP", 6, 20, 64, [4, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 
14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [4], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[3.799e-05], 0, 3.63637, 1691393960], "v": "v0.6"} -{"i": [["[\"a3df19e5b88592ef5a9ce584a1ca3010\", [1, 7, 7, 512], [4, 4, 512, 512], [1, 7, 7, 512], [1, 1, 1, 512], [1, 1, 1, 512], [1, 7, 7, 512]]", "cuda -keys=cuda,gpu -arch=sm_86 -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0, []], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [2, 1, 1, 1], 1], ["SP", 6, 10, 16, [1, 8, 2, 1], 1], ["SP", 6, 15, 512, [4, 2, 1, 2], 1], ["SP", 6, 20, 512, [4, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [4], 1], ["SP", 4, 4, 512, [1], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[5.86461e-05], 0, 1.92621, 1691394032], "v": "v0.6"} diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json 
b/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json deleted file mode 100644 index 4fb148c887bd..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/ci_logs/resnet-50-NHWC-B1-llvm.json +++ /dev/null @@ -1,28 +0,0 @@ -# Provide valid schedules for resnet-50 for CPU. -# This is used to run the tutorial on the documentation web server. -{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.5"} -{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 2048, 1, 1, 1, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.5"} -{"i": [["[\"875556d12d0be2269206a7775d5296a6\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 1, 1, 2048, 1, 1, 1, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.5"} -{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 14, 14, 1024, 1, 1, 1024, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.5"} -{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 1, 1, 1024, 1, 14, 14, 
1024]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 0.384722, 1606961002], "v": "v0.5"} -{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 28, 28, 512, 1, 1, 512, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.5"} -{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 1, 1, 512, 1, 28, 28, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.5"} -{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 56, 56, 256, 1, 1, 256, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.5"} -{"i": 
[["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.5"} -{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.5"} -{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 14, 14, 1024, 1, 1, 1024, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.5"} -{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": [[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.5"} -{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 
1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.5"} -{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 256, 1, 1, 256, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.5"} -{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 56, 56, 64, 3, 3, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.5"} -{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 7, 7, 2048, 1, 1, 2048, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 
0.000166094, 0.00016649, 0.000164394, 0.000169288, 0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.5"} -{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 56, 56, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.5"} -{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 512, 1, 1, 512, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.5"} -{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 256, 1, 1, 256, 512, 1, 28, 28, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 1606961801], "v": "v0.5"} -{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 1024, 1, 1, 1024, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 
3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.5"} -{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 7, 7, 512, 3, 3, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.5"} -{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.5"} -{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.5"} -{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 28, 28, 512]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], 
[["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.5"} -{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 1, 1, 256, 1, 56, 56, 256]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.5"} -{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 28, 28, 512, 1, 1, 512, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.5"} diff --git a/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json b/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json deleted file mode 100644 index 9bf6af0b17d8..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/ci_logs/sparse_dense.json +++ /dev/null @@ -1,2 +0,0 @@ -# Keep a valid schedule for demonstraction. This is used to prevent flasky errors in CI. 
-{"i": [["[\"sparse_dense\", 512, 512, 512, [9831, 16, 1], [9831], [33], \"float32\"]", "llvm -keys=cpu", [6, 64, 64, 0, 0, 0, 0, 0], "", 1, ["sparse_dense_bsr_512_512_512_16_1_0.60_W_data", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indices", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indptr"]], [[], [["CI", 8], ["CI", 6], ["SP", 5, 0, 512, [1, 8], 1], ["FSP", 9, 0, 2, 1], ["SP", 5, 3, 32, [32], 1], ["FSP", 9, 2, 4, 1], ["RE", 5, [0, 3, 1, 4, 6, 2, 5, 7]], ["RE", 9, [0, 2, 1, 3]], ["CA", 5, 9, 1], ["CI", 4], ["FU", 9, [0, 1]], ["AN", 9, 0, 3], ["PR", 5, 0, "auto_unroll_max_step$0"], ["AN", 9, 2, 2]]]], "r": [[0.000957008], 0, 0.605709, 1614689820], "v": "v0.6"} diff --git a/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py deleted file mode 100644 index 09dcd020d32c..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.py +++ /dev/null @@ -1,216 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _auto-scheduler-conv-gpu: - -Auto-scheduling a Convolution Layer for GPU -=========================================== -**Author**: `Lianmin Zheng `_, \ - `Chengfan Jia `_ - -This is a tutorial on how to use the auto-scheduler for GPUs. - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any templates. -Users only need to write the computation declaration without any schedule commands or templates. -The auto-scheduler can automatically generate a large search space and -find a good schedule in the space. - -We use a convolution layer as an example in this tutorial. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import os - -import numpy as np -import tvm -from tvm import te, auto_scheduler, topi -from tvm.topi.testing import conv2d_nchw_python - -###################################################################### -# Define the computation -# ^^^^^^^^^^^^^^^^^^^^^^ -# To begin with, let us define the computation of a convolution layer. -# The function should return the list of input/output tensors. -# From these tensors, the auto-scheduler can get the whole computational graph. 
- - -@auto_scheduler.register_workload -def conv2d_layer(N, H, W, CO, CI, KH, KW, stride, padding): - data = te.placeholder((N, CI, H, W), name="data") - kernel = te.placeholder((CO, CI, KH, KW), name="kernel") - bias = te.placeholder((1, CO, 1, 1), name="bias") - conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32") - out = topi.nn.relu(conv + bias) - return [data, kernel, bias, out] - - -###################################################################### -# Create the search task -# ^^^^^^^^^^^^^^^^^^^^^^ -# We then create a search task for the last convolution layer in the resnet. - -target = tvm.target.Target("cuda") - -# Use the last layer in ResNet-50 -N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) -task = auto_scheduler.SearchTask( - func=conv2d_layer, args=(N, H, W, CO, CI, KH, KW, strides, padding), target=target -) - -# Inspect the computational graph -print("Computational DAG:") -print(task.compute_dag) - -###################################################################### -# Next, we set parameters for the auto-scheduler. These parameters -# mainly specify how we do the measurement during the search. -# -# * :code:`measure_ctx` launches a different process for measurement to -# provide isolation. It can protect the main process from GPU crashes -# during measurement and avoid other runtime conflicts. -# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. -# This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value >= 300 ms. -# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. -# We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a -# good value for the search to converge. You can do more trials according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `conv2d.json`. -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. - -log_file = "conv2d.json" -measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) -tune_option = auto_scheduler.TuningOptions( - num_measure_trials=10, # change this to 1000 to achieve the best performance - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - verbose=2, -) - -###################################################################### -# Run the search -# ^^^^^^^^^^^^^^ -# Now we get all inputs ready. Pretty simple, isn't it? -# We can kick off the search and let the auto-scheduler do its magic. -# After some measurement trials, we can load the best schedule from the log -# file and apply it. - -# Run auto-tuning (search) -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. -# task.tune(tune_option) -# Apply the best schedule -sch, args = task.apply_best(log_file) - -# Kill the measurement process -del measure_ctx - -###################################################################### -# We can lower the schedule to see the IR after auto-scheduling. -# The auto-scheduler correctly performs optimizations including multi-level tiling, -# cooperative fetching, unrolling and operator fusion. 
- -print("Lowered TIR:") -print(tvm.lower(sch, args, simple_mode=True)) - -###################################################################### -# Check correctness and evaluate performance -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# We build the binary and check its correctness and performance. - -func = tvm.build(sch, args, target) - -# Check correctness -data_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32) -weight_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) -bias_np = np.random.uniform(size=(1, CO, 1, 1)).astype(np.float32) -conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding) -out_np = np.maximum(conv_np + bias_np, 0.0) - -dev = tvm.cuda() -data_tvm = tvm.nd.array(data_np, device=dev) -weight_tvm = tvm.nd.array(weight_np, device=dev) -bias_tvm = tvm.nd.array(bias_np, device=dev) -out_tvm = tvm.nd.empty(out_np.shape, device=dev) -func(data_tvm, weight_tvm, bias_tvm, out_tvm) - -# Check results -np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3) - -# Evaluate execution time -evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) -print( - "Execution time of this operator: %.3f ms" - % (np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000) -) - -###################################################################### -# Using the record file -# ^^^^^^^^^^^^^^^^^^^^^ -# During the search, all measurement records are dumped into the record -# file "conv2d.json". The measurement records can be used to re-apply search results, -# resume the search, and perform other analyses. - -###################################################################### -# Here is an example where we load the best schedule from a file, -# print the equivalent python schedule API and CUDA source code. -# They can be used for debugging and learning the behavior of the auto-scheduler. - -print("Equivalent python schedule:") -print(task.print_best(log_file, print_mode="schedule")) - -print("CUDA source code:") -print(task.print_best(log_file, print_mode="cuda")) - -###################################################################### -# A more complicated example is to resume the search. -# In this case, we need to create the search policy and cost model by ourselves -# and resume the status of search policy and cost model with the log file. -# In the example below we resume the status and do more 5 trials. - - -def resume_search(task, log_file): - print("Resume search:") - cost_model = auto_scheduler.XGBModel() - cost_model.update_from_file(log_file) - search_policy = auto_scheduler.SketchPolicy( - task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] - ) - measure_ctx = auto_scheduler.LocalRPCMeasureContext(min_repeat_ms=300) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=5, - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - task.tune(tune_option, search_policy=search_policy) - - # Kill the measurement process - del measure_ctx - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. 
-# resume_search(task, log_file) diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py b/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py deleted file mode 100644 index e4edf0333508..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py +++ /dev/null @@ -1,427 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for ARM CPU -============================================= -**Author**: `Thierry Moreau `_, \ - `Lianmin Zheng `_, \ - `Chengfan Jia `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for ARM CPU with the auto-scheduler via RPC. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - - -import numpy as np -import os -import sys - -import tvm -from tvm import relay, auto_scheduler -from tvm.relay import data_dep_optimization as ddo -import tvm.relay.testing -from tvm.contrib import graph_executor -from tvm.contrib.utils import tempdir - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. 
-# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. - - -def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=False): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "mlp": - mod, params = relay.testing.mlp.get_workload( - batch_size=batch_size, dtype=dtype, image_shape=image_shape, num_classes=1000 - ) - else: - raise ValueError("Network not found.") - - if use_sparse: - from tvm.topi.sparse.utils import convert_model_dense_to_sparse - - mod, params = convert_model_dense_to_sparse(mod, params, random_params=True) - - return mod, params, input_shape, output_shape - - -################################################################# -# Start RPC Tracker -# ----------------- -# TVM uses RPC session to communicate with ARM boards. -# During tuning, the tuner will send the generated code to the board and -# measure the speed of code on the board. -# -# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices. -# The RPC Tracker is a centralized controller node. We can register all devices to -# the tracker. For example, if we have 10 phones, we can register all of them -# to the tracker, and run 10 measurements in parallel, accelerating the tuning process. -# -# To start an RPC tracker, run this command on the host machine. The tracker is -# required during the whole tuning process, so we need to open a new terminal for -# this command: -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 -# -# The expected output is -# -# .. code-block:: bash -# -# INFO:RPCTracker:bind to 0.0.0.0:9190 - -################################################################# -# Register Devices to RPC Tracker -# ----------------------------------- -# Now we can register our devices to the tracker. The first step is to -# build the TVM runtime for the ARM devices. -# -# * For Linux: -# Follow this section :ref:`build-tvm-runtime-on-device` to build -# the TVM runtime on the device. Then register the device to tracker by -# -# .. 
code-block:: bash -# -# python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rasp4b-64 -# -# (replace :code:`[HOST_IP]` with the IP address of your host machine) -# -# * For Android: -# Follow this `readme page `_ to -# install the TVM RPC APK on the android device. Make sure you can pass the android rpc test. -# Then you have already registered your device. During tuning, you have to go to developer option -# and enable "Keep screen awake during changing" and charge your phone to make it stable. -# -# After registering devices, we can confirm it by querying rpc_tracker -# -# .. code-block:: bash -# -# python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190 -# -# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 4B with 64bit OS, and 2 rk3399, -# the output can be -# -# .. code-block:: bash -# -# Queue Status -# ---------------------------------- -# key total free pending -# ---------------------------------- -# mate10pro 2 2 0 -# rk3399 2 2 0 -# rasp4b-64 11 11 0 -# ---------------------------------- -# -# You can register multiple devices to the tracker to accelerate the measurement in tuning. - -########################################### -# Set Tuning Options -# ------------------ -# Before tuning, we should apply some configurations. Here I use a Raspberry Pi 4b 4GB board -# as example with a 64bit OS (Ubuntu 20.04). In your setting, you should modify the target -# and device_key accordingly. -# set :code:`use_ndk` to True if you use android phone. - -#### DEVICE CONFIG #### - -# Replace "aarch64-linux-gnu" with the correct target of your board. -# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device. -# FIXME(tmoreau89, merrymercy): We leave '-device=arm_cpu' out of the target string -# because we're sharing x86 op strategy. -target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon") - -# Also replace this with the device key, rpc host and rpc port in your tracker -device_key = "rasp4b-64" -rpc_host = "127.0.0.1" -rpc_port = 9190 - -# Set this to True if you use ndk tools for cross compiling -# And also set the environment variable below to point to the cross compiler -use_ndk = False -# os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++" - -#### TUNING OPTION #### -network = "mobilenet" -use_sparse = False -batch_size = 1 -layout = "NHWC" -dtype = "float32" -log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. 
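# A minimal numeric illustration of this objective (the per-task latencies and
# weights below are hypothetical placeholders; in practice the task scheduler
# measures and updates these values itself during tuning):
example_latency_ms = [0.013, 0.845, 4.194]  # latency[t] of three tasks, in ms
example_weight = [1, 4, 3]  # weight[t]: how often each subgraph appears in the network
estimated_total_ms = sum(l * w for l, w in zip(example_latency_ms, example_weight))
print("Estimated end-to-end latency: %.3f ms" % estimated_total_ms)  # 15.975 ms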
- -# Extract tasks from the network -print("Get model...") -mod, params, input_shape, output_shape = get_network( - network, batch_size, layout, dtype=dtype, use_sparse=use_sparse -) -print("Extract tasks...") -tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - -for idx, task in enumerate(tasks): - print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) - print(task.compute_dag) - - -################################################################# -# Tuning and Evaluation -# --------------------- -# Now, we set some options for tuning and launch the search tasks -# -# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. -# You can set it to a small number (e.g., 200) for a fast demonstrative run. -# In practice, we recommend setting it around :code:`800 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 29 tasks in resnet-50, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRunner` for more parameters. -# -# After auto-tuning, we can compile the network with the best schedules we found. -# All measurement records are dumped into the log file during auto-tuning, -# so we can read the log file and load the best schedules. - - -def tune_and_evaluate(): - print("Begin tuning...") - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=200, # change this to 20000 to achieve the best performance - builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"), - runner=auto_scheduler.RPCRunner( - device_key, - host=rpc_host, - port=rpc_port, - timeout=30, - repeat=1, - min_repeat_ms=200, - enable_cpu_cache_flush=True, - ), - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option) - - # Compile with the history best - print("Compile...") - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, config={"relay.backend.use_auto_scheduler": True} - ): - lib = relay.build(mod, target=target, params=params) - - # Export library - tmp = tempdir() - if use_ndk: - from tvm.contrib import ndk - - filename = "net.so" - lib.export_library(tmp.relpath(filename), fcompile=ndk.create_shared) - else: - filename = "net.tar" - lib.export_library(tmp.relpath(filename)) - - # Upload module to device - print("Upload...") - remote = auto_scheduler.utils.request_remote(device_key, rpc_host, rpc_port, timeout=10000) - remote.upload(tmp.relpath(filename)) - rlib = remote.load_module(filename) - - # Create graph executor - dev = remote.cpu() - module = graph_executor.GraphModule(rlib["default"](dev)) - data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) - module.set_input("data", data_tvm) - - # Evaluate - print("Evaluate inference time cost...") - print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -# We do not run the tuning in our webpage server since the server doesn't have a Raspberry Pi, -# or device tracker running. -# Uncomment the following line to run it by yourself. 
- -# tune_and_evaluate() - - -###################################################################### -# .. note:: Explaining the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.013 | 0.31 | 64 | -# | 1 | 0.845 | 2.43 | 448 | -# | 2 | 0.046 | -0.00 | 64 | -# | 3 | 4.194 | 24.53 | 2112 | -# | 4 | 0.109 | 9.21 | 64 | -# | 5 | 1.759 | 29.27 | 896 | -# | 6 | 0.083 | 6.01 | 64 | -# | 7 | 3.084 | 33.38 | 7680 | -# | 8 | 0.136 | 14.78 | 384 | -# | 9 | 1.349 | 38.23 | 768 | -# | 10 | 0.133 | 7.55 | 128 | -# | 11 | 2.747 | 37.56 | 1536 | -# | 12 | 0.338 | 11.87 | 192 | -# | 13 | 1.295 | 40.00 | 704 | -# | 14 | 0.482 | 4.16 | 256 | -# | 15 | 2.686 | 38.56 | 1344 | -# | 16 | 0.884 | 9.08 | 448 | -# | 17 | 1.332 | 39.18 | 704 | -# | 18 | 1.045 | 3.84 | 576 | -# | 19 | 1.391 | 38.09 | 704 | -# | 20 | 0.777 | 10.34 | 448 | -# | 21 | 0.739 | 30.97 | 448 | -# ------------------------------------------------- -# Estimated total latency: 38.347 ms Trials: 19992 Used time : 19260 s Next ID: 3 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "dmlc::Error"s errors, because the -# auto-scheduler will try some invalid schedules. -# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. If you have multiple target CPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. 
-# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`. diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py deleted file mode 100644 index f11aef253f81..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py +++ /dev/null @@ -1,296 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for NVIDIA GPU -=============================================== -**Author**: `Lianmin Zheng `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for NVIDIA GPU with the auto-scheduler. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -import sys -import numpy as np - -import tvm -from tvm import relay, auto_scheduler -import tvm.relay.testing -from tvm.contrib import graph_executor - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. 
-# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. - - -def get_network(name, batch_size, layout="NHWC", dtype="float32"): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - return mod, params, input_shape, output_shape - - -# Define the neural network and compilation target -network = "resnet-18" -batch_size = 1 -layout = "NHWC" -target = tvm.target.Target("cuda") -dtype = "float32" -log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. - -# Extract tasks from the network -print("Extract tasks...") -mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - -for idx, task in enumerate(tasks): - print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) - print(task.compute_dag) - -################################################################# -# Begin Tuning -# ------------ -# Now, we set some options for tuning and launch the search tasks -# -# * :code:`measure_ctx` launches a different process for measurement to -# provide isolation. It can protect the main process from GPU crashes -# during measurement and avoid other runtime conflicts. -# * :code:`min_repeat_ms` defines the minimum duration of one "repeat" in every measurement. -# This can warmup the GPU, which is necessary to get accurate measurement results. -# Typically, we recommend a value >= 300 ms. 
-# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. -# You can set it to a small number (e.g., 200) for a fast demonstrative run. -# In practice, we recommend setting it around :code:`900 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 24 tasks in resnet-18, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRPCMeasureContext` for more parameters. -# - - -def run_tuning(): - print("Begin tuning...") - measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10) - - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=200, # change this to 20000 to achieve the best performance - runner=measure_ctx.runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -# run_tuning() - - -###################################################################### -# .. note:: Explain the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.005 | 0.88 | 64 | -# | 1 | 0.010 | 99.10 | 64 | -# | 2 | 0.006 | 0.00 | 64 | -# | 3 | 0.145 | 979.78 | 384 | -# | 4 | 0.130 | 1097.02 | 384 | -# | 5 | 0.143 | 992.69 | 384 | -# | 6 | 0.076 | 1526.86 | 192 | -# | 7 | 0.115 | 999.44 | 320 | -# | 8 | 0.079 | 1449.39 | 320 | -# | 9 | 0.122 | 938.73 | 384 | -# | 10 | 0.063 | 1832.98 | 192 | -# | 11 | 0.072 | 1763.62 | 256 | -# | 12 | 0.062 | 2036.40 | 192 | -# | 13 | 0.068 | 1874.44 | 192 | -# | 14 | 0.049 | 2346.50 | 128 | -# | 15 | 0.076 | 1694.31 | 256 | -# | 16 | 0.067 | 1933.30 | 448 | -# | 17 | 0.076 | 1680.90 | 256 | -# | 18 | 0.022 | 98.43 | 64 | -# | 19 | 0.076 | 3112.55 | 192 | -# | 20 | 0.013 | 2026.44 | 64 | -# | 21 | 0.011 | 1136.69 | 64 | -# | 22 | 0.013 | 992.47 | 64 | -# | 23 | 0.020 | 627.56 | 64 | -# ------------------------------------------------- -# Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "tvm::Error"s and CUDA errors, because the -# auto-scheduler will try some invalid schedules. 
-# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - - -################################################################# -# Compile and Evaluate -# -------------------- -# After auto-tuning, we can compile the network with the best schedules we found. -# All measurement records are dumped into the log file during auto-tuning, -# so we can read the log file and load the best schedules. - -# Compile with the history best -print("Compile...") -with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): - lib = relay.build(mod, target=target, params=params) - -# Create graph executor -dev = tvm.device(str(target), 0) -module = graph_executor.GraphModule(lib["default"](dev)) -data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) -module.set_input("data", data_tvm) - -# Evaluate -print("Evaluate inference time cost...") -print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. If you have multiple target GPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. -# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`. diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py b/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py deleted file mode 100644 index 3120c30cef1a..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py +++ /dev/null @@ -1,348 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for mali GPU -============================================= -**Author**: `Zhao Wu `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for mali GPU with the auto-scheduler. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -import os -import sys - -import numpy as np - -import tvm -from tvm import relay, auto_scheduler -import tvm.relay.testing -from tvm.contrib import graph_executor - - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. -# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. 
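One way to follow the NHWC recommendation above for a model imported through a Relay frontend is Relay's ``ConvertLayout`` pass. The snippet below is a minimal sketch, not part of the original tutorial: it assumes ``mod`` already holds an imported module and that ``tvm`` and ``relay`` are imported as in the code that follows.

.. code-block:: python

    # Sketch: convert conv2d layers of an imported module to NHWC before
    # extracting auto-scheduler tasks. `mod` is assumed to come from a
    # relay.frontend importer (e.g. from_onnx or from_pytorch).
    desired_layouts = {"nn.conv2d": ["NHWC", "default"]}
    seq = tvm.transform.Sequential(
        [
            relay.transform.RemoveUnusedFunctions(),
            relay.transform.ConvertLayout(desired_layouts),
        ]
    )
    with tvm.transform.PassContext(opt_level=3):
        mod = seq(mod)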
- - -def get_network(name, batch_size, layout="NHWC", dtype="float32"): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - - return mod, params, input_shape, output_shape - - -# Define the neural network and compilation target. -network = "mobilenet" -batch_size = 1 -layout = "NHWC" -# Set this to True if you use ndk tools for cross compiling -use_ndk = True -# Path to cross compiler -os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++" -target = tvm.target.Target("opencl -device=mali", host="llvm -mtriple=aarch64-linux-gnu") -dtype = "float32" -log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) - - -################################################################# -# Start an RPC Tracker and Register Devices to the Tracker -# -------------------------------------------------------- -# Please refer to the "Start RPC Tracker" and "Register Devices to RPC Tracker" setions -# in this :ref:`tutorial ` to start an RPC tracker -# and register devices to the tracker. - -# Replace this with the device key in your tracker -device_key = "rk3399" - - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. - -# Extract tasks from the network -print("Extract tasks...") -mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) -tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - -for idx, task in enumerate(tasks): - print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) - print(task.compute_dag) -###################################################################### -# .. 
note:: How to get the hardware parameters from remote device -# -# .. code-block:: python -# -# from tvm.auto_scheduler.utils import request_remote -# remote = request_remote(device_key, "127.0.0.1", 9190) -# dev = remote.cl() -# max_shared_memory_per_block = dev.max_shared_memory_per_block -# # There is no explicit local memory limition -# # so we can use INT32_MAX to disable the check on local_memory. -# max_local_memory_per_block = 2147483647 # INT32_MAX -# max_threads_per_block = dev.max_threads_per_block -# max_vthread_extent = int(dev.warp_size / 4) if int(dev.warp_size / 4) > 1 else dev.warp_size -# warp_size = dev.warp_size -# hardware_params = auto_scheduler.HardwareParams(-1, 16, 64, -# max_shared_memory_per_block, max_local_memory_per_block, -# max_threads_per_block, max_vthread_extent, warp_size) -# -# Now you could pass it to search task and tune -# -# .. code-block:: python -# -# tasks, task_weights = auto_scheduler.extract_tasks( -# mod["main"], params, target, hardware_params = hardware_params -# ) -# - -################################################################# -# Tuning and Evaluate -# ------------------- -# Now, we set some options for tuning, launch the search tasks and evaluate the end-to-end performance -# -# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. -# You can set it to a small number (e.g., 200) for a fast demonstrative run. -# In practice, we recommend setting it around :code:`800 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 29 tasks in resnet-50, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRunner` for more parameters. 
-# - - -def tune_and_evaluate(): - print("Begin tuning...") - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=200, # change this to 20000 to achieve the best performance - builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"), - runner=auto_scheduler.RPCRunner( - device_key, host="127.0.0.1", port=9190, repeat=3, timeout=50 - ), - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - tuner.tune(tune_option) - - # Compile the whole network - print("Compile...") - with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext( - opt_level=3, config={"relay.backend.use_auto_scheduler": True} - ): - lib = relay.build(mod, target, params=params) - - # Create graph executor - print("=============== Request Remote ===============") - from tvm.auto_scheduler.utils import request_remote - - remote = request_remote(device_key, "127.0.0.1", 9190) - dev = remote.cl() - from tvm.contrib import utils, ndk - - temp = utils.tempdir() - filename = "deploy_lib.so" - path_lib = temp.relpath(filename) - lib.export_library(path_lib, fcompile=ndk.create_shared) - remote.upload(path_lib) - loaded_lib = remote.load_module(filename) - module = graph_executor.GraphModule(loaded_lib["default"](dev)) - data = (np.random.uniform(size=input_shape)).astype(dtype) - data_tvm = tvm.nd.array(data) - module.set_input("data", data_tvm) - - # Evaluate - print("Evaluate inference time cost...") - print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -# We do not run the tuning in our webpage server since server doesn't have mali gpu. -# Uncomment the following line to run it by yourself. - -# tune_and_evaluate() - -###################################################################### -# .. note:: Explain the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. 
code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.010 | 0.40 | 64 | -# | 1 | 0.087 | 47.19 | 64 | -# | 2 | 0.008 | -0.00 | 64 | -# | 3 | 0.177 | 582.07 | 64 | -# | 4 | 0.268 | 862.37 | 256 | -# | 5 | 0.166 | 621.13 | 128 | -# | 6 | 0.170 | 605.10 | 128 | -# | 7 | 0.128 | 403.20 | 64 | -# | 8 | 0.189 | 545.71 | 64 | -# | 9 | 0.231 | 1001.01 | 448 | -# | 10 | 0.155 | 664.80 | 256 | -# | 11 | 0.155 | 662.86 | 256 | -# | 12 | 0.119 | 434.08 | 64 | -# | 13 | 0.199 | 522.13 | 64 | -# | 14 | 0.235 | 986.56 | 320 | -# | 15 | 0.149 | 689.13 | 128 | -# | 16 | 0.155 | 664.80 | 192 | -# | 17 | 0.151 | 340.64 | 64 | -# | 18 | 0.176 | 597.55 | 128 | -# | 19 | 0.220 | 1054.37 | 192 | -# | 20 | 0.150 | 686.01 | 128 | -# | 21 | 0.159 | 650.88 | 128 | -# | 22 | 0.073 | 358.19 | 64 | -# | 23 | 0.031 | 70.63 | 64 | -# | 24 | 0.251 | 947.73 | 128 | -# | 25 | 0.157 | 652.47 | 128 | -# | 26 | 0.215 | 954.84 | 128 | -# | 27 | 0.237 | 868.92 | 128 | -# | 28 | 0.266 | 774.06 | 128 | -# ------------------------------------------------- -# Estimated total latency: 10.016 ms Trials: 3992 Used time : 1131 s Next ID: 15 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "tvm::Error"s errors, because the -# auto-scheduler will try some invalid schedules. -# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. If you have multiple target GPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. -# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`. 
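Tips 3 and 4 above can be combined. The sketch below is illustrative only and reuses names defined earlier in this tutorial (``tasks``, ``task_weights``, ``device_key``, ``use_ndk``, ``log_file``); it resumes the search from an existing log file while measuring on remote devices through the RPC tracker.

.. code-block:: python

    # Sketch: resume a previous search (tip 3) and measure via the RPC
    # tracker (tip 4). All names are defined earlier in this tutorial.
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=20000,
        builder=auto_scheduler.LocalBuilder(build_func="ndk" if use_ndk else "default"),
        runner=auto_scheduler.RPCRunner(
            device_key, host="127.0.0.1", port=9190, repeat=3, timeout=50
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)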
diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py deleted file mode 100644 index 43314a4b0a2f..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py +++ /dev/null @@ -1,333 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling a Neural Network for x86 CPU -============================================ -**Author**: `Lianmin Zheng `_, \ - `Chengfan Jia `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole neural -network for x86 CPU with the auto-scheduler. - -To auto-tune a neural network, we partition the network into small subgraphs and -tune them independently. Each subgraph is treated as one search task. -A task scheduler slices the time and dynamically allocates time resources to -these tasks. The task scheduler predicts the impact of each task on the end-to-end -execution time and prioritizes the one that can reduce the execution time the most. - -For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to -get the computational DAG in the tensor expression form. -We then use the auto-scheduler to construct a search space of this DAG and search -for good schedules (low-level optimizations). - -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any -schedule templates. In other words, the auto-scheduler only uses the compute declarations -in :code:`tvm/python/topi` and does not use existing schedule templates. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -import sys - -import numpy as np - -import tvm -from tvm import relay, auto_scheduler -from tvm.relay import data_dep_optimization as ddo -import tvm.relay.testing -from tvm.contrib import graph_executor - -################################################################# -# Define a Network -# ---------------- -# First, we need to define the network with relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow -# (see :ref:`front end tutorials`). -# -# For convolutional neural networks, although auto-scheduler can work correctly -# with any layout, we found the best performance is typically achieved with NHWC layout. -# We also implemented more optimizations for NHWC layout with the auto-scheduler. -# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. 
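The paragraph above notes that models can also come from frontend importers rather than ``relay.testing``. As a hedged illustration, an ONNX model could be loaded as shown below; the file name, input name, and shape are placeholders rather than assets shipped with the tutorial.

.. code-block:: python

    # Illustration only: import an ONNX model into Relay. "resnet50.onnx"
    # and the "data" input name/shape are placeholder values.
    import onnx

    onnx_model = onnx.load("resnet50.onnx")
    shape_dict = {"data": (1, 3, 224, 224)}
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)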
- - -def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=False): - """Get the symbol definition and random weight of a network""" - - # auto-scheduler prefers NHWC layout - if layout == "NHWC": - image_shape = (224, 224, 3) - elif layout == "NCHW": - image_shape = (3, 224, 224) - else: - raise ValueError("Invalid layout: " + layout) - - input_shape = (batch_size,) + image_shape - output_shape = (batch_size, 1000) - - if name.startswith("resnet-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name.startswith("resnet3d-"): - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, - batch_size=batch_size, - layout=layout, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload( - batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape - ) - elif name == "squeezenet_v1.1": - assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" - mod, params = relay.testing.squeezenet.get_workload( - version="1.1", - batch_size=batch_size, - dtype=dtype, - image_shape=image_shape, - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "mlp": - mod, params = relay.testing.mlp.get_workload( - batch_size=batch_size, dtype=dtype, image_shape=image_shape, num_classes=1000 - ) - else: - raise ValueError("Network not found.") - - if use_sparse: - from tvm.topi.sparse.utils import convert_model_dense_to_sparse - - mod, params = convert_model_dense_to_sparse(mod, params, bs_r=4, random_params=True) - - return mod, params, input_shape, output_shape - - -# Define the neural network and compilation target. -# If the target machine supports avx512 instructions, replace the -# "llvm -mcpu=core-avx2" with "llvm -mcpu=skylake-avx512" -network = "resnet-50" -use_sparse = False -batch_size = 1 -layout = "NHWC" -target = tvm.target.Target("llvm -mcpu=core-avx2") -dtype = "float32" -log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) - -################################################################# -# Extract Search Tasks -# -------------------- -# Next, we extract the search tasks and their weights from a network. -# The weight of a task is the number of appearances of the task's subgraph -# in the whole network. -# By using the weight, we can approximate the end-to-end latency of the network -# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the -# latency of a task and :code:`weight[t]` is the weight of the task. -# The task scheduler will just optimize this objective. 
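As a toy illustration of the objective described above, the estimate is simply a weighted sum of per-task latencies; the numbers below are made up.

.. code-block:: python

    # Made-up numbers, purely to illustrate sum(latency[t] * weight[t]).
    latency = [0.15, 0.08, 0.21]  # best measured latency of each task, in ms
    weight = [4, 2, 1]            # how many times each subgraph appears
    estimated_total = sum(l * w for l, w in zip(latency, weight))
    print("Estimated end-to-end latency: %.3f ms" % estimated_total)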
- -# Extract tasks from the network -print("Get model...") -mod, params, input_shape, output_shape = get_network( - network, - batch_size, - layout, - dtype=dtype, - use_sparse=use_sparse, -) - -print("Extract tasks...") -tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - -for idx, task in enumerate(tasks): - print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) - print(task.compute_dag) - -################################################################# -# Begin Tuning -# ------------ -# Now, we set some options for tuning and launch the search tasks -# -# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. -# You can set it to a small number (e.g., 200) for a fast demonstrative run. -# In practice, we recommend setting it around :code:`800 * len(tasks)`, -# which is typically enough for the search to converge. -# For example, there are 29 tasks in resnet-50, so we can set it as 20000. -# You can adjust this parameter according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions`, -# :any:`auto_scheduler.LocalRunner` for more parameters. -# - - -def run_tuning(): - print("Begin tuning...") - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=200, # change this to 20000 to achieve the best performance - runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True), - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - ) - - if use_sparse: - from tvm.topi.sparse.utils import sparse_sketch_rules - - search_policy = [ - auto_scheduler.SketchPolicy( - task, - program_cost_model=auto_scheduler.XGBModel(), - init_search_callbacks=sparse_sketch_rules(), - ) - for task in tasks - ] - - tuner.tune(tune_option, search_policy=search_policy) - else: - tuner.tune(tune_option) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -# run_tuning() - - -###################################################################### -# .. note:: Explain the printed information during tuning -# -# During the tuning, a lot of information will be printed on the console. -# They are used for debugging purposes. The most important info is the output -# of the task scheduler. The following table is a sample output. -# -# .. 
code-block:: c -# -# ---------------------------------------------------------------------- -# ------------------------------ [ Task Scheduler ] -# ---------------------------------------------------------------------- -# | ID | Latency (ms) | Speed (GFLOPS) | Trials | -# ------------------------------------------------- -# | 0 | 0.010 | 0.40 | 64 | -# | 1 | 0.087 | 47.19 | 64 | -# | 2 | 0.008 | -0.00 | 64 | -# | 3 | 0.177 | 582.07 | 64 | -# | 4 | 0.268 | 862.37 | 256 | -# | 5 | 0.166 | 621.13 | 128 | -# | 6 | 0.170 | 605.10 | 128 | -# | 7 | 0.128 | 403.20 | 64 | -# | 8 | 0.189 | 545.71 | 64 | -# | 9 | 0.231 | 1001.01 | 448 | -# | 10 | 0.155 | 664.80 | 256 | -# | 11 | 0.155 | 662.86 | 256 | -# | 12 | 0.119 | 434.08 | 64 | -# | 13 | 0.199 | 522.13 | 64 | -# | 14 | 0.235 | 986.56 | 320 | -# | 15 | 0.149 | 689.13 | 128 | -# | 16 | 0.155 | 664.80 | 192 | -# | 17 | 0.151 | 340.64 | 64 | -# | 18 | 0.176 | 597.55 | 128 | -# | 19 | 0.220 | 1054.37 | 192 | -# | 20 | 0.150 | 686.01 | 128 | -# | 21 | 0.159 | 650.88 | 128 | -# | 22 | 0.073 | 358.19 | 64 | -# | 23 | 0.031 | 70.63 | 64 | -# | 24 | 0.251 | 947.73 | 128 | -# | 25 | 0.157 | 652.47 | 128 | -# | 26 | 0.215 | 954.84 | 128 | -# | 27 | 0.237 | 868.92 | 128 | -# | 28 | 0.266 | 774.06 | 128 | -# ------------------------------------------------- -# Estimated total latency: 10.016 ms Trials: 3992 Used time : 1131 s Next ID: 15 -# -# This table lists the latency and (estimated) speed of all tasks. -# It also lists the allocation of measurement trials for all tasks. -# The last line prints the total weighted latency of these tasks, -# which can be a rough estimation of the end-to-end execution time -# of the network. -# The last line also prints the total number of measurement trials, -# total time spent on auto-tuning and the id of the next task to tune. -# -# There will also be some "tvm::Error"s errors, because the -# auto-scheduler will try some invalid schedules. -# You can safely ignore them if the tuning can continue, because these -# errors are isolated from the main process. -# - -###################################################################### -# .. note:: Terminate the tuning earlier -# -# You can terminate the tuning earlier by forcibly killing this process. -# As long as you get at least one valid schedule for each task in the log file, -# you should be able to do the compilation (the secion below). -# - - -################################################################# -# Compile and Evaluate -# -------------------- -# After auto-tuning, we can compile the network with the best schedules we found. -# All measurement records are dumped into the log file during auto-tuning, -# so we can read the log file and load the best schedules. - -# Compile with the history best -print("Compile...") -with auto_scheduler.ApplyHistoryBest(log_file): - with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}): - lib = relay.build(mod, target=target, params=params) - -# Create graph executor -dev = tvm.device(str(target), 0) -module = graph_executor.GraphModule(lib["default"](dev)) -data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) -module.set_input("data", data_tvm) - -# Evaluate -print("Evaluate inference time cost...") -print(module.benchmark(dev, repeat=3, min_repeat_ms=500)) - - -################################################################# -# Other Tips -# ---------- -# 1. During the tuning, the auto-scheduler needs to compile many programs and -# extract feature from them. 
This part is CPU-intensive, -# so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` -# to distill the large log file and only save the best useful records. -# 3. You can resume a search from the previous log file. You just need to -# add a new argument :code:`load_log_file` when creating the task scheduler -# in function :code:`run_tuning`. Say, -# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` -# 4. If you have multiple target CPUs, you can use all of them for measurements to -# parallelize the measurements. Check this :ref:`section ` -# to learn how to use the RPC Tracker and RPC Server. -# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` -# with :any:`auto_scheduler.RPCRunner`. diff --git a/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py deleted file mode 100644 index a9c45c15ce82..000000000000 --- a/gallery/how_to/tune_with_autoscheduler/tune_sparse_x86.py +++ /dev/null @@ -1,330 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule -=========================================================================== -**Author**: `Chengfan Jia `_ - -This is a tutorial on how to use the auto-scheduler to tune a sparse matrix multiplication for -CPUs. - -Auto-scheduler is designed to explore the schedule with best performance for a given computation -declaration automatically. While sometimes, we may have a demand to try some special ops which may -not been well-supported by auto-scheduler's default sketch rules and result in poor performance. -Fortunately, auto-scheduler currently allows user to provide a CustomSketch to cover these cases. - -We use sparse matrix multiplication as an example in this tutorial to demonstrate how to implement -and plug a custom sketch rule to the auto-scheduler's search policy. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - - -import os - -import numpy as np -import tvm -import tvm.testing -from tvm import te, auto_scheduler, runtime, topi -from tvm.auto_scheduler import _ffi_api -from tvm.topi.utils import get_const_tuple -from tvm.topi.sparse.utils import random_bsr_matrix - -###################################################################### -# Define the computation -# ^^^^^^^^^^^^^^^^^^^^^^ -# To begin with, let us define the computation of a sparse matmul with several relu and bias add. 
-# The function should return the list of input/output tensors. -# From these tensors, the auto-scheduler can get the whole computational graph. - - -@auto_scheduler.register_workload -def sparse_dense(M, N, K, w_data_shape, w_indices_shape, w_indptr_shape, dtype): - X = te.placeholder(shape=(M, K), dtype=dtype) - W_data = te.placeholder(shape=w_data_shape, dtype=dtype) - W_indices = te.placeholder(shape=w_indices_shape, dtype="int32") - W_indptr = te.placeholder(shape=w_indptr_shape, dtype="int32") - B = te.placeholder(shape=(M, N), dtype=dtype) - - out = topi.nn.sparse_dense(topi.nn.relu(X), W_data, W_indices, W_indptr) - out = te.compute((M, N), lambda i, j: out[i, j] + B[i, j], name="BiasAdd") - out = topi.nn.relu(out) - - return [X, W_data, W_indices, W_indptr, B, out] - - -###################################################################### -# Special step for sparse workload -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# During schedule tuning, auto-scheduler will use random inputs to measure the performance of a -# generated schedule. While we cannot directly use a random array as the input of a sparse op, for -# the "indices" and "indptr" array are meaningful for the computation. -# -# To solve this problem, we register these as special buffers, and load them when process program -# measuring. -# See the `tvm.auto_scheduler.measure.py` for more details. - -# Define the basic shapes of this sparse computation -M = 128 -K = 256 -N = 512 -BS_R = 16 -BS_C = 1 -density = 0.6 - -# Generate the test data with numpy -X_np = np.random.randn(M, K).astype("float32") -X_np = np.maximum(np.zeros((M, K), dtype="float32"), X_np) # Relu -W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32") -W_np = W_sp_np.todense() -Y_np = X_np @ W_np.T # Process the matrix multiplication -B_np = np.random.randn(M, N).astype("float32") -Y_np = Y_np + B_np # Bias add -Y_np = np.maximum(np.zeros((M, N), dtype="float32"), Y_np) # Relu - - -###################################################################### -# Create the search task -# ^^^^^^^^^^^^^^^^^^^^^^ -# We then create a search task with M=N=K=512 and dtype="float32" -# If your machine supports avx instructions, you can -# -# - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 -# - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 - -target = tvm.target.Target("llvm") - -# Register the sparse data to task inputs -prefix = "sparse_dense_bsr_%d_%d_%d_%d_%d_%d_" % ( - N, - K, - BS_R, - BS_C, - W_sp_np.indices.shape[0], - W_sp_np.indptr.shape[0], -) -task = tvm.auto_scheduler.SearchTask( - func=sparse_dense, - args=(M, N, K, W_sp_np.data.shape, W_sp_np.indices.shape, W_sp_np.indptr.shape, "float32"), - target=target, - task_inputs={ - prefix + "W_data": runtime.ndarray.array(W_sp_np.data), - prefix + "W_indices": runtime.ndarray.array(W_sp_np.indices), - prefix + "W_indptr": runtime.ndarray.array(W_sp_np.indptr), - }, - task_inputs_save_to_file=True, -) - -# Inspect the computational graph -print("Computational DAG:") -print(task.compute_dag) - -###################################################################### -# Write the custom sketch for sparse dense op -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Before tuning, we will need to define the CustomSketchRule for the sparse dense op. -# -# CustomSketchRule consists of two parts: the condition function and the apply function. -# -# - condition function: describe when to apply this sketch rule. 
For example, we can only apply -# the rule to the sparse ops by matching their name and tag. -# - apply function: describe how to generate the initial sketch. You can implement it using -# auto-scheduler provided loop state APIs. - - -def meet_condition_func(search_policy, state, stage_id): - state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag) - if state.stages[stage_id].op.tag in [ - "sparse_dense_sp_rhs_bsrmm", - "sparse_dense_sp_rhs_bsrmm_block", - ]: - return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST - else: - return auto_scheduler.PreloadCustomSketchRule.PASS - - -def apply_func(search_policy, state, stage_id): - ret = [] - s0 = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag) - if s0.stages[stage_id].op.tag == "sparse_dense_sp_rhs_bsrmm_block": - return [s0.state_object, stage_id - 1] - - sparse_dense = s0.stages[stage_id].op - sparse_dense_block = s0.stages[stage_id - 1].op - assert sparse_dense.tag == "sparse_dense_sp_rhs_bsrmm" - assert sparse_dense_block.tag == "sparse_dense_sp_rhs_bsrmm_block" - - # Set the default consumer of compute block - consumer = sparse_dense - - # If sparse dense has a single elementwise consumer - # We can compute inline the sparse_dense output stage - consumers = _ffi_api.SearchPolicyUtilsGetConsumers( - search_policy.search_task, s0.state_object, stage_id - ) - if len(consumers) == 1: - consumer_id = int(consumers.items()[0][0]) - if _ffi_api.SearchPolicyUtilsIsElementwiseMatch( - search_policy.search_task, s0.state_object, stage_id, consumer_id - ): - consumer = s0.stages[consumer_id].op - s0.compute_inline(sparse_dense) - - i, nb_j, j, row_offset, c = s0[sparse_dense_block].iters - m, n = s0[consumer].iters - i0, i1, i2 = s0.split(sparse_dense_block, i, [None, None]) - m0, m1 = s0.follow_split(consumer, m, len(s0.transform_steps) - 1, 1) - j0, j1 = s0.split(sparse_dense_block, nb_j, [None]) - n0, n1 = s0.follow_split(consumer, n, len(s0.transform_steps) - 1, 1) - s0.reorder(sparse_dense_block, [i0, j0, i1, j1, row_offset, i2, j, c]) - s0.reorder(consumer, [m0, n0, m1, n1]) - s0.compute_at(sparse_dense_block, consumer, n0) - - ret.append([s0.state_object, stage_id - 2]) - - return ret - - -###################################################################### -# Next, we set parameters for the auto-scheduler with the custom sketch plugged in. -# -# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. -# We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a -# good value for the search to converge. You can do more trials according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a file -# `sparse_dense.json`. -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. -# * see :any:`auto_scheduler.TuningOptions` for more parameters -# * Here, we need to create a :code:`auto_scheduler.SketchPolicy` object, and add the custom sketch -# rule as a `init_search_callbacks`. 
- -log_file = "sparse_dense.json" -tune_option = auto_scheduler.TuningOptions( - num_measure_trials=10, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - verbose=2, -) - -search_policy = auto_scheduler.SketchPolicy( - task, - program_cost_model=auto_scheduler.XGBModel(), - init_search_callbacks=[ - auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func, "SparseDense") - ], -) - - -###################################################################### -# Run the search -# ^^^^^^^^^^^^^^ -# Now we get all inputs ready. -# We can kick off the search and let the auto-scheduler do its magic. -# After some measurement trials, we can load the best schedule from the log -# file and apply it. - - -def tune_and_evaluate(tune_option, search_policy): - # Run auto-tuning (search) - task.tune(tune_option, search_policy) - - # Apply the best schedule - sch, args = task.apply_best(log_file) - - # We can lower the schedule to see the IR after auto-scheduling. - # The auto-scheduler correctly performs optimizations including multi-level tiling, - # layout transformation, parallelization, vectorization, unrolling, and operator fusion. - print("Lowered TIR:") - print(tvm.lower(sch, args, simple_mode=True)) - - # Check correctness and evaluate performance - # We build the binary and check its correctness and performance. - func = tvm.build(sch, args, target) - - dev = tvm.cpu() - - X_tvm = tvm.nd.array(X_np, device=dev) - W_data_tvm = tvm.nd.array(W_sp_np.data, device=dev) - W_indices_tvm = tvm.nd.array(W_sp_np.indices, device=dev) - W_indptr_tvm = tvm.nd.array(W_sp_np.indptr, device=dev) - B_tvm = tvm.nd.array(B_np, device=dev) - Y_tvm = tvm.nd.empty(Y_np.shape, device=dev) - - func(X_tvm, W_data_tvm, W_indices_tvm, W_indptr_tvm, B_tvm, Y_tvm) - - # Check results - tvm.testing.assert_allclose(Y_np, Y_tvm.numpy(), atol=1e-4, rtol=1e-4) - - # Evaluate execution time. - evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) - print( - "Execution time of this operator: %.3f ms" - % ( - np.median( - evaluator(X_tvm, W_data_tvm, W_indices_tvm, W_indptr_tvm, B_tvm, Y_tvm).results - ) - * 1000 - ) - ) - - -# Notice: We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. -# tune_and_evaluate(tune_option, search_policy) - - -###################################################################### -# .. note:: Tuning result example -# -# .. 
code-block:: c -# -# ---------------------------------------------------------------------- -# Lowered TIR: -# primfn(placeholder_5: handle, placeholder_6: handle, placeholder_7: handle, placeholder_8: handle, placeholder_9: handle, compute_1: handle) -> () -# attr = {"global_symbol": "main", "tir.noalias": True} -# buffers = {placeholder_2: Buffer(placeholder_10: Pointer(float32), float32, [9831, 16, 1], []), -# placeholder_4: Buffer(placeholder_11: Pointer(int32), int32, [33], []), -# placeholder_3: Buffer(placeholder_12: Pointer(float32), float32, [512, 512], []), -# compute: Buffer(compute_2: Pointer(float32), float32, [512, 512], []), -# placeholder_1: Buffer(placeholder_13: Pointer(float32), float32, [512, 512], []), -# placeholder: Buffer(placeholder_14: Pointer(int32), int32, [9831], [])} -# buffer_map = {placeholder_7: placeholder, placeholder_9: placeholder_1, placeholder_6: placeholder_2, compute_1: compute, placeholder_5: placeholder_3, placeholder_8: placeholder_4} { -# for (i0.outer.i1.outer.fused: int32, 0, 1024) "parallel" { -# attr [compute_3: Pointer(float32)] "storage_scope" = "global"; -# allocate(compute_3, float32, [256]) { -# for (nb_j.inner: int32, 0, 2) { -# for (i.inner.init: int32, 0, 8) { -# for (j.init: int32, 0, 16) { -# compute_3[(((i.inner.init*32) + (nb_j.inner*16)) + j.init)] = 0f32 -# } -# } -# for (elem_idx: int32, 0, ((int32*)placeholder_11[(((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) + 1)] - (int32*)placeholder_11[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)])) { -# for (i.inner: int32, 0, 8) { -# for (j: int32, 0, 16) { -# compute_3[(((i.inner*32) + (nb_j.inner*16)) + j)] = ((float32*)compute_3[(((i.inner*32) + (nb_j.inner*16)) + j)] + ((float32*)placeholder_10[((((int32*)placeholder_11[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)]*16) + (elem_idx*16)) + j)]*max((float32*)placeholder_12[(((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i.inner*512)) + (int32*)placeholder_14[((int32*)placeholder_11[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)] + elem_idx)])], 0f32))) -# } -# } -# } -# } -# for (i0.inner: int32, 0, 8) { -# compute_2[ramp((((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)), 1, 32)] = max(((float32x32*)compute_3[ramp((i0.inner*32), 1, 32)] + (float32x32*)placeholder_13[ramp((((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)), 1, 32)]), broadcast(0f32, 32)) -# } -# } -# } -# } diff --git a/gallery/how_to/tune_with_autotvm/README.txt b/gallery/how_to/tune_with_autotvm/README.txt deleted file mode 100644 index 2b32c085c2fd..000000000000 --- a/gallery/how_to/tune_with_autotvm/README.txt +++ /dev/null @@ -1,9 +0,0 @@ -.. _tutorials-autotvm-sec: - -Auto-Tune with Templates and AutoTVM ------------------------------------- - -AutoTVM offers a way to tune models and operators by providing a template -schedule, and searcing the parameter space defined by the template. These -how-tos demonstrate how to write template schedules and optimize them for a -variety of different hardware platforms. diff --git a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py b/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py deleted file mode 100644 index 503977aed568..000000000000 --- a/gallery/how_to/tune_with_autotvm/tune_conv2d_cuda.py +++ /dev/null @@ -1,253 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Tuning High Performance Convolution on NVIDIA GPUs -========================================================================= -**Author**: `Lianmin Zheng `_ - -This is an advanced tutorial for writing high performance tunable template for -NVIDIA GPU. By running auto-tuner on this template, we can outperform the -vendor provided library CuDNN in many cases. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use autotvm package in tvm, we need to install some extra dependencies. -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost tornado cloudpickle -# -# To make TVM run faster in tuning, it is recommended to use cython -# as FFI of tvm. In the root directory of tvm, execute -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import logging -import sys -import numpy as np - -import tvm -from tvm import te, topi, testing -from tvm.topi.testing import conv2d_nchw_python -import tvm.testing - -from tvm import autotvm - -###################################################################### -# Step 1: Define the search space -# -------------------------------- -# There are plenty of useful schedule primitives in tvm. You can also find -# some tutorials that describe them in more details, such as -# (1). :ref:`opt-conv-gpu` -# (2). `Optimizing DepthwiseConv on NVIDIA GPU `_ -# -# However, their implementations are manually tuned for some special input -# shapes. In this section, we build a large enough space to cover -# the techniques used in these tutorials. Then we rely on the efficient auto-tuner -# to search through this space and pick some good configurations. -# -# If you are familiar with writing cuda schedule, you can find the following -# template is very general. Actually this template can be easily modified -# to tune other operators such as depthwise convolution and GEMM. -# In order to fully understand this template, you should be familiar with -# the schedule primitives and auto tuning API. 
You can refer to the above -# tutorials and :ref:`autotvm tutorial ` -# -# It is worth noting that the search space for a conv2d operator -# can be very large (at the level of 10^9 for some input shapes) -# - - -@autotvm.template("tutorial/conv2d_no_batching") -def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding): - assert N == 1, "Only consider batch_size = 1 in this template" - - data = te.placeholder((N, CI, H, W), name="data") - kernel = te.placeholder((CO, CI, KH, KW), name="kernel") - conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype="float32") - s = te.create_schedule([conv.op]) - - ##### space definition begin ##### - n, f, y, x = s[conv].op.axis - rc, ry, rx = s[conv].op.reduce_axis - - cfg = autotvm.get_config() - cfg.define_split("tile_f", f, num_outputs=4) - cfg.define_split("tile_y", y, num_outputs=4) - cfg.define_split("tile_x", x, num_outputs=4) - cfg.define_split("tile_rc", rc, num_outputs=3) - cfg.define_split("tile_ry", ry, num_outputs=3) - cfg.define_split("tile_rx", rx, num_outputs=3) - cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) - cfg.define_knob("unroll_explicit", [0, 1]) - ##### space definition end ##### - - # inline padding - pad_data = s[conv].op.input_tensors[0] - s[pad_data].compute_inline() - data, raw_data = pad_data, data - - output = conv - OL = s.cache_write(conv, "local") - - # create cache stage - AA = s.cache_read(data, "shared", [OL]) - WW = s.cache_read(kernel, "shared", [OL]) - AL = s.cache_read(AA, "local", [OL]) - WL = s.cache_read(WW, "local", [OL]) - - # tile and bind spatial axes - n, f, y, x = s[output].op.axis - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - kernel_scope = n # this is the scope to attach global config inside this kernel - - s[output].bind(bf, te.thread_axis("blockIdx.z")) - s[output].bind(by, te.thread_axis("blockIdx.y")) - s[output].bind(bx, te.thread_axis("blockIdx.x")) - s[output].bind(vf, te.thread_axis("vthread")) - s[output].bind(vy, te.thread_axis("vthread")) - s[output].bind(vx, te.thread_axis("vthread")) - s[output].bind(tf, te.thread_axis("threadIdx.z")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) - s[OL].compute_at(s[output], tx) - - # tile reduction axes - n, f, y, x = s[OL].op.axis - rc, ry, rx = s[OL].op.reduce_axis - rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc) - ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry) - rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx) - s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) - - s[AA].compute_at(s[OL], rxo) - s[WW].compute_at(s[OL], rxo) - s[AL].compute_at(s[OL], rxm) - s[WL].compute_at(s[OL], rxm) - - # cooperative fetching - for load in [AA, WW]: - n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) - tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) - ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) - tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, te.thread_axis("threadIdx.z")) - s[load].bind(ty, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) - - # tune unroll - s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) - - return s, [raw_data, 
kernel, conv] - - -###################################################################### -# Step 2: Search through the space -# --------------------------------- -# We pick the last layer on resnet as test case. -# Since our space is very large, :code:`XGBoostTuner` is most suitable -# for our case. Here we only do 20 trials for demonstration. -# In practice, making 1000 trials usually can find some good kernels -# for this template - -# logging config (for printing tuning log to screen) -logging.getLogger("autotvm").setLevel(logging.DEBUG) -logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) - -# the last layer in resnet -N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) -task = autotvm.task.create( - "tutorial/conv2d_no_batching", args=(N, H, W, CO, CI, KH, KW, strides, padding), target="cuda" -) -print(task.config_space) - -# Use local gpu, measure 10 times for every config to reduce variance -# The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds -measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(), - runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4), -) - -record_file = None -# Begin tuning, log records to file `conv2d.log` -# During tuning we will also try many invalid configs, so you are expected to -# see many error reports. As long as you can see non-zero GFLOPS, it is okay. - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following lines to run it by yourself. - -# tuner = autotvm.tuner.XGBTuner(task) -# record_file = "conv2d.log" -# tuner.tune( -# n_trial=5, -# measure_option=measure_option, -# callbacks=[autotvm.callback.log_to_file(record_file)], -# ) - -######################################################################### -# Finally we can inspect the best config from log file, check correctness, -# and measure running time. - -# inspect the best config -dispatch_context = autotvm.apply_history_best(record_file) -best_config = dispatch_context.query(task.target, task.workload) -print("\nBest config:") -print(best_config) - -# apply history best from log file -with autotvm.apply_history_best(record_file): - with tvm.target.Target("cuda"): - s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding) - func = tvm.build(s, arg_bufs) - -# check correctness -a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32) -w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) -c_np = conv2d_nchw_python(a_np, w_np, strides, padding) - -dev = tvm.cuda() -a_tvm = tvm.nd.array(a_np, device=dev) -w_tvm = tvm.nd.array(w_np, device=dev) -c_tvm = tvm.nd.empty(c_np.shape, device=dev) -func(a_tvm, w_tvm, c_tvm) - -tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-2) - -# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise -# and the overhead of kernel launch. You can also use nvprof to validate the result. -evaluator = func.time_evaluator(func.entry_name, dev, number=400) -print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean) diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_arm.py b/gallery/how_to/tune_with_autotvm/tune_relay_arm.py deleted file mode 100644 index 13674f5cdac0..000000000000 --- a/gallery/how_to/tune_with_autotvm/tune_relay_arm.py +++ /dev/null @@ -1,422 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tune_relay_arm: - -Auto-tuning a Convolutional Network for ARM CPU -=============================================== -**Author**: `Lianmin Zheng `_, `Zhao Wu `_, `Eddie Yan `_ - -Auto-tuning for a specific ARM device is critical for getting the best -performance. This is a tutorial about how to tune a whole convolutional -network. - -The operator implementation for ARM CPU in TVM is written in template form. -The template has many tunable knobs (tile factor, vectorization, unrolling, etc). -We will tune all convolution and depthwise convolution operators -in the neural network. After tuning, we produce a log file which stores -the best knob values for all required operators. When the TVM compiler compiles -these operators, it will query this log file to get the best knob values. - -We also released pre-tuned parameters for some arm devices. You can go to -`ARM CPU Benchmark `_ -to see the results. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use the autotvm package in tvm, we need to install some extra dependencies. -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost tornado cloudpickle -# -# To make TVM run faster during tuning, it is recommended to use cython -# as FFI of TVM. In the root directory of TVM, execute -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. - - -import os - -import numpy as np -import tvm -from tvm import relay, autotvm -import tvm.relay.testing -from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib.utils import tempdir -import tvm.contrib.graph_executor as runtime - -################################################################# -# Define network -# -------------- -# First we need to define the network in relay frontend API. -# We can load some pre-defined network from :code:`relay.testing`. -# We can also load models from MXNet, ONNX and TensorFlow. 
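As a concrete illustration of the last point, a model that has been exported to ONNX could be brought into Relay roughly as in the sketch below. This is an editor's addition for illustration, not part of the original tutorial; the file name :code:`resnet18.onnx` and the input name/shape are placeholders for your own model.

.. code-block:: python

    import onnx
    from tvm import relay

    # Load a previously exported ONNX model (placeholder file name).
    onnx_model = onnx.load("resnet18.onnx")

    # Map the model's input name to its shape; both values are assumptions here.
    shape_dict = {"data": (1, 3, 224, 224)}

    # Convert to a Relay module plus parameter dict, the same (mod, params)
    # pair that get_network() below returns for the built-in networks.
    mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)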
- - -def get_network(name, batch_size): - """Get the symbol definition and random weight of a network""" - input_shape = (batch_size, 3, 224, 224) - output_shape = (batch_size, 1000) - - if "resnet" in name: - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, batch_size=batch_size, dtype=dtype - ) - elif "vgg" in name: - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.vgg.get_workload( - num_layers=n_layer, batch_size=batch_size, dtype=dtype - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size) - elif name == "squeezenet_v1.1": - mod, params = relay.testing.squeezenet.get_workload( - batch_size=batch_size, version="1.1", dtype=dtype - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - else: - raise ValueError("Unsupported network: " + name) - - return mod, params, input_shape, output_shape - - -################################################################# -# Start RPC Tracker -# ----------------- -# TVM uses RPC session to communicate with ARM boards. -# During tuning, the tuner will send the generated code to the board and -# measure the speed of code on the board. -# -# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices. -# The RPC Tracker is a centralized controller node. We can register all devices to -# the tracker. For example, if we have 10 phones, we can register all of them -# to the tracker, and run 10 measurements in parallel, accelerating the tuning process. -# -# To start an RPC tracker, run this command on the host machine. The tracker is -# required during the whole tuning process, so we need to open a new terminal for -# this command: -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 -# -# The expected output is -# -# .. code-block:: bash -# -# INFO:RPCTracker:bind to 0.0.0.0:9190 - -################################################################# -# Register Devices to RPC Tracker -# ----------------------------------- -# Now we can register our devices to the tracker. The first step is to -# build the TVM runtime for the ARM devices. -# -# * For Linux: -# Follow this section :ref:`build-tvm-runtime-on-device` to build -# the TVM runtime on the device. Then register the device to tracker by -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399 -# -# (replace :code:`[HOST_IP]` with the IP address of your host machine) -# -# * For Android: -# Follow this `readme page `_ to -# install the TVM RPC APK on the android device. Make sure you can pass the android rpc test. -# Then you have already registered your device. During tuning, you have to go to developer option -# and enable "Keep screen awake during changing" and charge your phone to make it stable. -# -# After registering devices, we can confirm it by querying rpc_tracker -# -# .. code-block:: bash -# -# python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190 -# -# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 3B and 2 rk3399, -# the output can be -# -# .. 
code-block:: bash -# -# Queue Status -# ---------------------------------- -# key total free pending -# ---------------------------------- -# mate10pro 2 2 0 -# rk3399 2 2 0 -# rpi3b 11 11 0 -# ---------------------------------- -# -# You can register multiple devices to the tracker to accelerate the measurement in tuning. - -########################################### -# Set Tuning Options -# ------------------ -# Before tuning, we should apply some configurations. Here I use an RK3399 board -# as example. In your setting, you should modify the target and device_key accordingly. -# set :code:`use_android` to True if you use android phone. - -#### DEVICE CONFIG #### - -# Replace "aarch64-linux-gnu" with the correct target of your board. -# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device. -target = tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu") - -# Also replace this with the device key in your tracker -device_key = "rk3399" - -# Set this to True if you use android phone -use_android = False - -#### TUNING OPTION #### -network = "resnet-18" -log_file = "%s.%s.log" % (device_key, network) -dtype = "float32" - -tuning_option = { - "log_filename": log_file, - "tuner": "xgb", - "n_trial": 1500, - "early_stopping": 800, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"), - runner=autotvm.RPCRunner( - device_key, - host="127.0.0.1", - port=9190, - number=5, - timeout=10, - ), - ), -} - -#################################################################### -# -# .. note:: How to set tuning options -# -# In general, the default values provided here work well. -# If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger, -# which makes the tuning run longer. -# If your device runs very slow or your conv2d operators have many GFLOPs, considering to -# set timeout larger. -# -# If your model has depthwise convolution, you could consider setting -# :code:`try_spatial_pack_depthwise` be :code:`True`, which perform better than default -# optimization in general. For example, on ARM CPU A53 2.0GHz, we find it could boost 1.6x -# performance of depthwise convolution on Mobilenet V1 model. - -################################################################### -# Begin Tuning -# ------------ -# Now we can extract tuning tasks from the network and begin tuning. -# Here, we provide a simple utility function to tune a list of tasks. -# This function is just an initial implementation which tunes them in sequential order. -# We will introduce a more sophisticated tuning scheduler in the future. - -# You can skip the implementation of this function for this tutorial. 
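If you do read through it, note that it wraps one small core pattern per task: construct a tuner, then call its :code:`tune` method with a trial budget, the measure option, and logging callbacks. The sketch below shows that pattern for a single task in isolation; it is not part of the tutorial, and :code:`task` and :code:`measure_option` are assumed to already exist.

.. code-block:: python

    from tvm import autotvm
    from tvm.autotvm.tuner import XGBTuner

    # One tuner per task; the "xgb" choice in tune_tasks() maps to
    # XGBTuner(task, loss_type="reg").
    tuner = XGBTuner(task, loss_type="reg")

    # Never request more trials than the size of the search space.
    n_trial = min(1000, len(task.config_space))

    tuner.tune(
        n_trial=n_trial,
        measure_option=measure_option,
        callbacks=[
            autotvm.callback.progress_bar(n_trial),
            autotvm.callback.log_to_file("tuning.log"),
        ],
    )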
-def tune_tasks( - tasks, - measure_option, - tuner="xgb", - n_trial=1000, - early_stopping=None, - log_filename="tuning.log", - use_transfer_learning=True, -): - # create tmp log file - tmp_log_file = log_filename + ".tmp" - if os.path.exists(tmp_log_file): - os.remove(tmp_log_file) - - for i, tsk in enumerate(reversed(tasks)): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - - # create tuner - if tuner == "xgb": - tuner_obj = XGBTuner(tsk, loss_type="reg") - elif tuner == "xgb_knob": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob") - elif tuner == "xgb_itervar": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar") - elif tuner == "xgb_curve": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve") - elif tuner == "xgb_rank": - tuner_obj = XGBTuner(tsk, loss_type="rank") - elif tuner == "xgb_rank_knob": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob") - elif tuner == "xgb_rank_itervar": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar") - elif tuner == "xgb_rank_curve": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve") - elif tuner == "xgb_rank_binary": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary") - elif tuner == "xgb_rank_binary_knob": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob") - elif tuner == "xgb_rank_binary_itervar": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar") - elif tuner == "xgb_rank_binary_curve": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve") - elif tuner == "ga": - tuner_obj = GATuner(tsk, pop_size=50) - elif tuner == "random": - tuner_obj = RandomTuner(tsk) - elif tuner == "gridsearch": - tuner_obj = GridSearchTuner(tsk) - else: - raise ValueError("Invalid tuner: " + tuner) - - if use_transfer_learning: - if os.path.isfile(tmp_log_file): - tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) - - # process tuning - tsk_trial = min(n_trial, len(tsk.config_space)) - tuner_obj.tune( - n_trial=tsk_trial, - early_stopping=early_stopping, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(tsk_trial, prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - - # pick best records to a cache file - autotvm.record.pick_best(tmp_log_file, log_filename) - os.remove(tmp_log_file) - - -######################################################################## -# Finally, we launch tuning jobs and evaluate the end-to-end performance. 
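The helper below compiles the network with the best records in the log and measures end-to-end latency on the board. Independently of that, the raw tuning records can also be inspected directly. The following sketch is an editor's addition, not part of the tutorial: it prints the best measured time per workload, assuming tuning has already produced the :code:`log_file` defined above.

.. code-block:: python

    import numpy as np
    from tvm import autotvm

    # Scan the tuning log and keep the best (lowest) mean cost per workload.
    best_cost = {}
    for inp, res in autotvm.record.load_from_file(log_file):
        if res.error_no != 0:
            continue  # skip failed measurements
        cost = float(np.mean(res.costs))
        key = inp.task.workload
        if key not in best_cost or cost < best_cost[key]:
            best_cost[key] = cost

    for workload, cost in best_cost.items():
        # workload[0] is the task name, e.g. a conv2d template name.
        print("%s: %.6f s" % (workload[0], cost))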
- - -def tune_and_evaluate(tuning_opt): - # extract workloads from relay program - print("Extract tasks...") - mod, params, input_shape, _ = get_network(network, batch_size=1) - tasks = autotvm.task.extract_from_program( - mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),) - ) - - # run tuning tasks - print("Tuning...") - tune_tasks(tasks, **tuning_opt) - - # compile kernels with history best records - with autotvm.apply_history_best(log_file): - print("Compile...") - with tvm.transform.PassContext(opt_level=3): - lib = relay.build_module.build(mod, target=target, params=params) - - # export library - tmp = tempdir() - if use_android: - from tvm.contrib import ndk - - filename = "net.so" - lib.export_library(tmp.relpath(filename), fcompile=ndk.create_shared) - else: - filename = "net.tar" - lib.export_library(tmp.relpath(filename)) - - # upload module to device - print("Upload...") - remote = autotvm.measure.request_remote(device_key, "127.0.0.1", 9190, timeout=10000) - remote.upload(tmp.relpath(filename)) - rlib = remote.load_module(filename) - - # upload parameters to device - dev = remote.device(str(target), 0) - module = runtime.GraphModule(rlib["default"](dev)) - data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) - module.set_input("data", data_tvm) - - # evaluate - print("Evaluate inference time cost...") - print(module.benchmark(dev, number=1, repeat=10)) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -# tune_and_evaluate(tuning_option) - -###################################################################### -# Sample Output -# ------------- -# The tuning needs to compile many programs and extract feature from them. -# So a high performance CPU is recommended. -# One sample output is listed below. -# It takes about 2 hours on a 32T AMD Ryzen Threadripper. -# -# .. code-block:: bash -# -# Extract tasks... -# Tuning... -# [Task 1/12] Current/Best: 22.37/ 52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done. -# [Task 2/12] Current/Best: 6.51/ 18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done. -# [Task 3/12] Current/Best: 4.67/ 24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done. -# [Task 4/12] Current/Best: 11.35/ 46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done. -# [Task 5/12] Current/Best: 1.01/ 19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done. -# [Task 6/12] Current/Best: 2.47/ 23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done. -# [Task 7/12] Current/Best: 14.57/ 33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done. -# [Task 8/12] Current/Best: 1.13/ 17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done. -# [Task 9/12] Current/Best: 14.45/ 22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done. -# [Task 10/12] Current/Best: 3.22/ 15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done. -# [Task 11/12] Current/Best: 11.03/ 32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done. -# [Task 12/12] Current/Best: 8.00/ 21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done. -# Compile... -# Upload... -# Evaluate inference time cost... -# Mean inference time (std dev): 162.59 ms (0.06 ms) - -###################################################################### -# -# .. note:: **Experiencing Difficulties?** -# -# The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS", -# then there must be something wrong. -# -# First, make sure you set the correct configuration of your device. 
-# Then, you can print debug information by adding these lines in the beginning -# of the script. It will print every measurement result, where you can find useful -# error messages. -# -# .. code-block:: python -# -# import logging -# logging.getLogger('autotvm').setLevel(logging.DEBUG) -# -# Finally, always feel free to ask our community for help on https://discuss.tvm.apache.org diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py b/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py deleted file mode 100644 index 53d56c709ddf..000000000000 --- a/gallery/how_to/tune_with_autotvm/tune_relay_cuda.py +++ /dev/null @@ -1,398 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-tuning a Convolutional Network for NVIDIA GPU -================================================== -**Author**: `Lianmin Zheng `_, `Eddie Yan `_ - -Auto-tuning for specific devices and workloads is critical for getting the -best performance. This is a tutorial on how to tune a whole convolutional -network for NVIDIA GPU. - -The operator implementation for NVIDIA GPU in TVM is written in template form. -The template has many tunable knobs (tile factor, unrolling, etc). -We will tune all convolution and depthwise convolution operators -in the neural network. After tuning, we produce a log file which stores -the best knob values for all required operators. When the TVM compiler compiles -these operators, it will query this log file to get the best knob values. - -We also released pre-tuned parameters for some NVIDIA GPUs. You can go to -`NVIDIA GPU Benchmark `_ -to see the results. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use the autotvm package in tvm, we need to install some extra dependencies. -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost tornado cloudpickle -# -# To make TVM run faster during tuning, it is recommended to use cython -# as FFI of tvm. In the root directory of tvm, execute: -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. 
- -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import os - -import numpy as np - -import tvm -from tvm import relay, autotvm -import tvm.relay.testing -from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -import tvm.contrib.graph_executor as runtime - -################################################################# -# Define Network -# -------------- -# First we need to define the network in relay frontend API. -# We can load some pre-defined network from :code:`tvm.relay.testing`. -# We can also load models from MXNet, ONNX and TensorFlow. - - -def get_network(name, batch_size): - """Get the symbol definition and random weight of a network""" - input_shape = (batch_size, 3, 224, 224) - output_shape = (batch_size, 1000) - - if "resnet" in name: - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, batch_size=batch_size, dtype=dtype - ) - elif "vgg" in name: - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.vgg.get_workload( - num_layers=n_layer, batch_size=batch_size, dtype=dtype - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "squeezenet_v1.1": - mod, params = relay.testing.squeezenet.get_workload( - batch_size=batch_size, version="1.1", dtype=dtype - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - else: - raise ValueError("Unsupported network: " + name) - - return mod, params, input_shape, output_shape - - -########################################### -# Set Tuning Options -# ------------------ -# Before tuning, we apply some configurations. - -#### DEVICE CONFIG #### -target = tvm.target.cuda() - -#### TUNING OPTION #### -network = "resnet-18" -log_file = "%s.log" % network -dtype = "float32" - -tuning_option = { - "log_filename": log_file, - "tuner": "xgb", - "n_trial": 2000, - "early_stopping": 600, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(timeout=10), - runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150), - ), -} - -#################################################################### -# -# .. note:: How to set tuning options -# -# In general, the default value provided here works well. -# -# If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger, -# which makes the tuning runs longer. -# -# If you have multiple devices, you can use all of them for measurement to -# accelerate the tuning process. (see the 'Scale up measurement` section below). -# - -################################################################### -# Begin Tuning -# ------------ -# Now we can extract tuning tasks from the network and begin tuning. -# Here, we provide a simple utility function to tune a list of tasks. -# This function is just an initial implementation which tunes them in sequential order. -# We will introduce a more sophisticated tuning scheduler in the future. - -# You can skip the implementation of this function for this tutorial. 
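One detail in the implementation worth calling out is transfer learning: when :code:`use_transfer_learning` is enabled, the records collected so far are fed back into each new tuner so its cost model does not start from scratch. Reduced to its essence, that step looks like the sketch below; this is illustrative only, :code:`task` is assumed to exist, and :code:`previous.log` is a hypothetical log from an earlier run.

.. code-block:: python

    import os
    from tvm import autotvm
    from tvm.autotvm.tuner import XGBTuner

    tuner = XGBTuner(task, loss_type="reg")

    # Seed the cost model with measurements from an earlier tuning session, if any.
    if os.path.isfile("previous.log"):
        tuner.load_history(autotvm.record.load_from_file("previous.log"))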
-def tune_tasks( - tasks, - measure_option, - tuner="xgb", - n_trial=1000, - early_stopping=None, - log_filename="tuning.log", - use_transfer_learning=True, -): - # create tmp log file - tmp_log_file = log_filename + ".tmp" - if os.path.exists(tmp_log_file): - os.remove(tmp_log_file) - - for i, tsk in enumerate(reversed(tasks)): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - - # create tuner - if tuner == "xgb": - tuner_obj = XGBTuner(tsk, loss_type="reg") - elif tuner == "xgb_knob": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob") - elif tuner == "xgb_itervar": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar") - elif tuner == "xgb_curve": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve") - elif tuner == "xgb_rank": - tuner_obj = XGBTuner(tsk, loss_type="rank") - elif tuner == "xgb_rank_knob": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob") - elif tuner == "xgb_rank_itervar": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar") - elif tuner == "xgb_rank_curve": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve") - elif tuner == "xgb_rank_binary": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary") - elif tuner == "xgb_rank_binary_knob": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob") - elif tuner == "xgb_rank_binary_itervar": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar") - elif tuner == "xgb_rank_binary_curve": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve") - elif tuner == "ga": - tuner_obj = GATuner(tsk, pop_size=100) - elif tuner == "random": - tuner_obj = RandomTuner(tsk) - elif tuner == "gridsearch": - tuner_obj = GridSearchTuner(tsk) - else: - raise ValueError("Invalid tuner: " + tuner) - - if use_transfer_learning: - if os.path.isfile(tmp_log_file): - tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) - - # do tuning - tsk_trial = min(n_trial, len(tsk.config_space)) - tuner_obj.tune( - n_trial=tsk_trial, - early_stopping=early_stopping, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(tsk_trial, prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - - # pick best records to a cache file - autotvm.record.pick_best(tmp_log_file, log_filename) - os.remove(tmp_log_file) - - -######################################################################## -# Finally, we launch tuning jobs and evaluate the end-to-end performance. 
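The helper below rebuilds the network under :code:`autotvm.apply_history_best` and benchmarks it in the same process. If you want to reuse the tuned result elsewhere without repeating tuning or compilation, the built library can also be exported and reloaded, as in the sketch below. This is an editor's addition; the file name is a placeholder, and :code:`lib` refers to the module returned by :code:`relay.build_module.build` inside :code:`tune_and_evaluate`.

.. code-block:: python

    import tvm
    from tvm.contrib import graph_executor

    # Save the compiled library produced under apply_history_best (placeholder name).
    lib.export_library("tuned_resnet18_cuda.so")

    # Later, possibly in a different process: reload and create a runtime module.
    loaded = tvm.runtime.load_module("tuned_resnet18_cuda.so")
    dev = tvm.cuda()
    module = graph_executor.GraphModule(loaded["default"](dev))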
- - -def tune_and_evaluate(tuning_opt): - # extract workloads from relay program - print("Extract tasks...") - mod, params, input_shape, out_shape = get_network(network, batch_size=1) - tasks = autotvm.task.extract_from_program( - mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),) - ) - - # run tuning tasks - print("Tuning...") - tune_tasks(tasks, **tuning_opt) - - # compile kernels with history best records - with autotvm.apply_history_best(log_file): - print("Compile...") - with tvm.transform.PassContext(opt_level=3): - lib = relay.build_module.build(mod, target=target, params=params) - - # load parameters - dev = tvm.device(str(target), 0) - module = runtime.GraphModule(lib["default"](dev)) - data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) - module.set_input("data", data_tvm) - - # evaluate - print("Evaluate inference time cost...") - print(module.benchmark(dev, number=1, repeat=600)) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -# tune_and_evaluate(tuning_option) - -###################################################################### -# Sample Output -# ------------- -# The tuning needs to compile many programs and extract feature from them. -# So a high performance CPU is recommended. One sample output is listed below. -# It takes about 4 hours to get the following output on a 32T AMD Ryzen Threadripper. -# The tuning target is NVIDIA 1080 Ti. -# (You can see some errors during compilation. If the tuning is not stuck, it is okay.) -# -# .. code-block:: bash -# -# Extract tasks... -# Tuning... -# [Task 1/12] Current/Best: 541.83/3570.66 GFLOPS | Progress: (960/2000) | 1001.31 s Done. -# [Task 2/12] Current/Best: 0.56/ 803.33 GFLOPS | Progress: (704/2000) | 608.08 s Done. -# [Task 3/12] Current/Best: 103.69/1141.25 GFLOPS | Progress: (768/2000) | 702.13 s Done. -# [Task 4/12] Current/Best: 2905.03/3925.15 GFLOPS | Progress: (864/2000) | 745.94 sterminate called without an active exception -# [Task 4/12] Current/Best: 2789.36/3925.15 GFLOPS | Progress: (1056/2000) | 929.40 s Done. -# [Task 5/12] Current/Best: 89.06/1076.24 GFLOPS | Progress: (704/2000) | 601.73 s Done. -# [Task 6/12] Current/Best: 40.39/2129.02 GFLOPS | Progress: (1088/2000) | 1125.76 s Done. -# [Task 7/12] Current/Best: 4090.53/5007.02 GFLOPS | Progress: (800/2000) | 903.90 s Done. -# [Task 8/12] Current/Best: 4.78/1272.28 GFLOPS | Progress: (768/2000) | 749.14 s Done. -# [Task 9/12] Current/Best: 1391.45/2325.08 GFLOPS | Progress: (992/2000) | 1084.87 s Done. -# [Task 10/12] Current/Best: 1995.44/2383.59 GFLOPS | Progress: (864/2000) | 862.60 s Done. -# [Task 11/12] Current/Best: 4093.94/4899.80 GFLOPS | Progress: (224/2000) | 240.92 sterminate called without an active exception -# [Task 11/12] Current/Best: 3487.98/4909.91 GFLOPS | Progress: (480/2000) | 534.96 sterminate called without an active exception -# [Task 11/12] Current/Best: 4636.84/4912.17 GFLOPS | Progress: (1184/2000) | 1381.16 sterminate called without an active exception -# [Task 11/12] Current/Best: 50.12/4912.17 GFLOPS | Progress: (1344/2000) | 1602.81 s Done. -# [Task 12/12] Current/Best: 3581.31/4286.30 GFLOPS | Progress: (736/2000) | 943.52 s Done. -# Compile... -# Evaluate inference time cost... -# Mean inference time (std dev): 1.07 ms (0.05 ms) -# -# As a reference baseline, the time cost of MXNet + TensorRT on resnet-18 is 1.30ms. So we are a little faster. 
- -###################################################################### -# -# .. note:: **Experiencing Difficulties?** -# -# The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS", -# then there must be something wrong. -# -# First, make sure you set the correct configuration of your device. -# Then, you can print debug information by adding these lines in the beginning -# of the script. It will print every measurement result, where you can find useful -# error messages. -# -# .. code-block:: python -# -# import logging -# logging.getLogger('autotvm').setLevel(logging.DEBUG) -# -# Finally, always feel free to ask our community for help on https://discuss.tvm.apache.org - -################################################################# -# .. _tutorials-autotvm-scale-up-rpc-tracker: - -################################################################# -# Scale up measurement by using multiple devices -# ---------------------------------------------- -# If you have multiple devices, you can use all of them for measurement. -# TVM uses the RPC Tracker to manage distributed devices. -# The RPC Tracker is a centralized controller node. We can register all devices to -# the tracker. For example, if we have 10 GPU cards, we can register all of them -# to the tracker, and run 10 measurements in parallel, accelerating the tuning process. -# -# To start an RPC tracker, run this command on the host machine. The tracker is -# required during the whole tuning process, so we need to open a new terminal for -# this command: -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 -# -# The expected output is -# -# .. code-block:: bash -# -# INFO:RPCTracker:bind to 0.0.0.0:9190 -# -# Then open another new terminal for the RPC server. We need to start one dedicated server -# for each device. We use a string key to distinguish the types of devices. -# You can pick a name you like. -# (Note: For rocm backend, there are some internal errors with the compiler, -# we need to add `--no-fork` to the argument list.) -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_server --tracker=127.0.0.1:9190 --key=1080ti -# -# After registering devices, we can confirm it by querying rpc_tracker -# -# .. code-block:: bash -# -# python -m tvm.exec.query_rpc_tracker --host=127.0.0.1 --port=9190 -# -# For example, if we have four 1080ti, two titanx and one gfx900, the output can be -# -# .. code-block:: bash -# -# Queue Status -# ---------------------------------- -# key total free pending -# ---------------------------------- -# 1080ti 4 4 0 -# titanx 2 2 0 -# gfx900 1 1 0 -# ---------------------------------- -# -# Finally, we need to change the tuning option to use RPCRunner. Use the code below -# to replace the corresponding part above. 
- -tuning_option = { - "log_filename": log_file, - "tuner": "xgb", - "n_trial": 2000, - "early_stopping": 600, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(timeout=10), - runner=autotvm.RPCRunner( - "1080ti", # change the device key to your key - "127.0.0.1", - 9190, - number=20, - repeat=3, - timeout=4, - min_repeat_ms=150, - ), - ), -} diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py b/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py deleted file mode 100644 index d5b4b217abc0..000000000000 --- a/gallery/how_to/tune_with_autotvm/tune_relay_mobile_gpu.py +++ /dev/null @@ -1,426 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-tuning a Convolutional Network for Mobile GPU -================================================== -**Author**: `Lianmin Zheng `_, `Eddie Yan `_ - -Auto-tuning for a specific device is critical for getting the best -performance. This is a tutorial about how to tune a whole convolutional -network. - -The operator implementation for Mobile GPU in TVM is written in template form. -The template has many tunable knobs (tile factor, vectorization, unrolling, etc). -We will tune all convolution, depthwise convolution and dense operators -in the neural network. After tuning, we produce a log file which stores -the best knob values for all required operators. When the TVM compiler compiles -these operators, it will query this log file to get the best knob values. - -We also released pre-tuned parameters for some arm devices. You can go to -`Mobile GPU Benchmark `_ -to see the results. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use the autotvm package in tvm, we need to install some extra dependencies. -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost tornado cloudpickle -# -# To make TVM run faster during tuning, it is recommended to use cython -# as FFI of tvm. In the root directory of tvm, execute -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. 
- - -import os - -import numpy as np - -import tvm -from tvm import relay, autotvm -import tvm.relay.testing -from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.contrib.utils import tempdir -import tvm.contrib.graph_executor as runtime - -################################################################# -# Define network -# -------------- -# First we need to define the network in relay frontend API. -# We can load some pre-defined network from :code:`relay.testing`. -# We can also load models from MXNet, ONNX and TensorFlow. - - -def get_network(name, batch_size): - """Get the symbol definition and random weight of a network""" - input_shape = (batch_size, 3, 224, 224) - output_shape = (batch_size, 1000) - - if "resnet" in name: - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, batch_size=batch_size, dtype=dtype - ) - elif "vgg" in name: - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.vgg.get_workload( - num_layers=n_layer, batch_size=batch_size, dtype=dtype - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "squeezenet_v1.1": - mod, params = relay.testing.squeezenet.get_workload( - batch_size=batch_size, version="1.1", dtype=dtype - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - else: - raise ValueError("Unsupported network: " + name) - - return mod, params, input_shape, output_shape - - -################################################################# -# .. _tutorials-autotvm-start-rpc-tracker: - -################################################################# -# Start RPC Tracker -# ----------------- -# TVM uses RPC session to communicate with ARM boards. -# During tuning, the tuner will send the generated code to the board and -# measure the speed of code on the board. -# -# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices. -# The RPC Tracker is a centralized controller node. We can register all devices to -# the tracker. For example, if we have 10 phones, we can register all of them -# to the tracker, and run 10 measurements in parallel, accelerating the tuning process. -# -# To start an RPC tracker, run this command on the host machine. The tracker is -# required during the whole tuning process, so we need to open a new terminal for -# this command: -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 -# -# The expected output is -# -# .. code-block:: bash -# -# INFO:RPCTracker:bind to 0.0.0.0:9190 - -################################################################# -# Register Devices to RPC Tracker -# ----------------------------------- -# Now we can register our devices to the tracker. The first step is to -# build the TVM runtime for the ARM devices. -# -# * For Linux: -# Follow this section :ref:`build-tvm-runtime-on-device` to build -# the TVM runtime on the device. Then register the device to tracker by -# -# .. code-block:: bash -# -# python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rk3399 -# -# (replace :code:`[HOST_IP]` with the IP address of your host machine) -# -# * For Android: -# Follow this `readme page `_ to -# install TVM RPC APK on the android device. Make sure you can pass the android RPC test. -# Then you have already registered your device. 
During tuning, you have to go to developer option -# and enable "Keep screen awake during changing" and charge your phone to make it stable. -# -# After registering devices, we can confirm it by querying rpc_tracker -# -# .. code-block:: bash -# -# python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190 -# -# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 3B and 2 rk3399, -# the output can be -# -# .. code-block:: bash -# -# Queue Status -# ---------------------------------- -# key total free pending -# ---------------------------------- -# mate10pro 2 2 0 -# rk3399 2 2 0 -# rpi3b 11 11 0 -# ---------------------------------- -# -# You can register multiple devices to the tracker to accelerate the measurement in tuning. - -########################################### -# Set Tuning Options -# ------------------ -# Before tuning, we should apply some configurations. Here I use an RK3399 board -# as example. In your setting, you should modify the target and device_key accordingly. -# set :code:`use_android` to True if you use android phone. - -#### DEVICE CONFIG #### -# Replace "aarch64-linux-gnu" with the correct target of your board. -# This target host is used for cross compilation. You can query it by :code:`gcc -v` on your device. -target = tvm.target.Target("opencl -device=mali", host="llvm -mtriple=aarch64-linux-gnu") - -# Also replace this with the device key in your tracker -device_key = "rk3399" - -# Set this to True if you use android phone -use_android = False - -#### TUNING OPTION #### -network = "resnet-18" -log_file = "%s.%s.log" % (device_key, network) -dtype = "float32" - -tuning_option = { - "log_filename": log_file, - "tuner": "xgb", - "n_trial": 1000, - "early_stopping": 450, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func="ndk" if use_android else "default"), - runner=autotvm.RPCRunner( - device_key, - host="127.0.0.1", - port=9190, - number=10, - timeout=5, - ), - ), -} - -#################################################################### -# -# .. note:: How to set tuning options -# -# In general, the default values provided here work well. -# If you have enough time budget, you can set :code:`n_trial`, :code:`early_stopping` larger, -# which makes the tuning run longer. -# If your device runs very slow or your conv2d operators have many GFLOPs, considering to -# set timeout larger. -# - -################################################################### -# Begin Tuning -# ------------ -# Now we can extract tuning tasks from the network and begin tuning. -# Here, we provide a simple utility function to tune a list of tasks. -# This function is just an initial implementation which tunes them in sequential order. -# We will introduce a more sophisticated tuning scheduler in the future. - -# You can skip the implementation of this function for this tutorial. 
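Before committing to a tuning run that can take hours, it is worth a quick end-to-end check that the cross-compilation target and the RPC setup from the previous sections actually work. The sketch below is an editor's addition, not part of the tutorial: it builds a trivial kernel for the :code:`target` defined above, requests the board registered under :code:`device_key` from the tracker at 127.0.0.1:9190, and runs the kernel once on the remote Mali GPU. It assumes a Linux board; for Android you would export a shared library with the NDK instead.

.. code-block:: python

    import numpy as np
    import tvm
    from tvm import te, autotvm
    from tvm.contrib.utils import tempdir

    # A trivial element-wise kernel, scheduled for the GPU.
    n = 1024
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].bind(xo, te.thread_axis("blockIdx.x"))
    s[B].bind(xi, te.thread_axis("threadIdx.x"))
    f = tvm.build(s, [A, B], target=target, name="add_one")

    # Cross-compile, upload through the tracker, and load on the board.
    tmp = tempdir()
    f.export_library(tmp.relpath("add_one.tar"))
    remote = autotvm.measure.request_remote(device_key, "127.0.0.1", 9190, timeout=60)
    remote.upload(tmp.relpath("add_one.tar"))
    f_remote = remote.load_module("add_one.tar")

    # Run once on the remote OpenCL device and check the result.
    dev = remote.cl(0)
    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
    b = tvm.nd.array(np.zeros(n, dtype="float32"), dev)
    f_remote(a, b)
    np.testing.assert_allclose(b.numpy(), a.numpy() + 1.0)
    print("Remote smoke test passed.")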
-def tune_tasks( - tasks, - measure_option, - tuner="xgb", - n_trial=1000, - early_stopping=None, - log_filename="tuning.log", - use_transfer_learning=True, -): - # create tmp log file - tmp_log_file = log_filename + ".tmp" - if os.path.exists(tmp_log_file): - os.remove(tmp_log_file) - - for i, tsk in enumerate(reversed(tasks)): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - - # create tuner - if tuner == "xgb": - tuner_obj = XGBTuner(tsk, loss_type="reg") - elif tuner == "xgb_knob": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="knob") - elif tuner == "xgb_itervar": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="itervar") - elif tuner == "xgb_curve": - tuner_obj = XGBTuner(tsk, loss_type="reg", feature_type="curve") - elif tuner == "xgb_rank": - tuner_obj = XGBTuner(tsk, loss_type="rank") - elif tuner == "xgb_rank_knob": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="knob") - elif tuner == "xgb_rank_itervar": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="itervar") - elif tuner == "xgb_rank_curve": - tuner_obj = XGBTuner(tsk, loss_type="rank", feature_type="curve") - elif tuner == "xgb_rank_binary": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary") - elif tuner == "xgb_rank_binary_knob": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="knob") - elif tuner == "xgb_rank_binary_itervar": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="itervar") - elif tuner == "xgb_rank_binary_curve": - tuner_obj = XGBTuner(tsk, loss_type="rank-binary", feature_type="curve") - elif tuner == "ga": - tuner_obj = GATuner(tsk, pop_size=50) - elif tuner == "random": - tuner_obj = RandomTuner(tsk) - elif tuner == "gridsearch": - tuner_obj = GridSearchTuner(tsk) - else: - raise ValueError("Invalid tuner: " + tuner) - - if use_transfer_learning: - if os.path.isfile(tmp_log_file): - tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file)) - - # do tuning - tsk_trial = min(n_trial, len(tsk.config_space)) - tuner_obj.tune( - n_trial=tsk_trial, - early_stopping=early_stopping, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(tsk_trial, prefix=prefix), - autotvm.callback.log_to_file(tmp_log_file), - ], - ) - - # pick best records to a cache file - autotvm.record.pick_best(tmp_log_file, log_filename) - os.remove(tmp_log_file) - - -######################################################################## -# Finally, we launch tuning jobs and evaluate the end-to-end performance. 
- - -def tune_and_evaluate(tuning_opt): - # extract workloads from relay program - print("Extract tasks...") - mod, params, input_shape, _ = get_network(network, batch_size=1) - tasks = autotvm.task.extract_from_program( - mod["main"], - target=target, - params=params, - ops=(relay.op.get("nn.conv2d"),), - ) - - # run tuning tasks - print("Tuning...") - tune_tasks(tasks, **tuning_opt) - - # compile kernels with history best records - with autotvm.apply_history_best(log_file): - print("Compile...") - with tvm.transform.PassContext(opt_level=3): - lib = relay.build_module.build(mod, target=target, params=params) - # export library - tmp = tempdir() - if use_android: - from tvm.contrib import ndk - - filename = "net.so" - lib.export_library(tmp.relpath(filename), fcompile=ndk.create_shared) - else: - filename = "net.tar" - lib.export_library(tmp.relpath(filename)) - - # upload module to device - print("Upload...") - remote = autotvm.measure.request_remote(device_key, "127.0.0.1", 9190, timeout=10000) - remote.upload(tmp.relpath(filename)) - rlib = remote.load_module(filename) - - # upload parameters to device - dev = remote.device(str(target), 0) - module = runtime.GraphModule(rlib["default"](dev)) - data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) - module.set_input("data", data_tvm) - - # evaluate - print("Evaluate inference time cost...") - print(module.benchmark(dev, number=1, repeat=30)) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -# tune_and_evaluate(tuning_option) - -###################################################################### -# Sample Output -# ------------- -# The tuning needs to compile many programs and extract feature from them. -# So a high performance CPU is recommended. -# One sample output is listed below. It takes about 3 hours on a 32T AMD Ryzen Threadripper. -# -# .. code-block:: bash -# -# Extract tasks... -# Tuning... -# [Task 1/17] Current/Best: 25.30/ 39.12 GFLOPS | Progress: (992/1000) | 751.22 s Done. -# [Task 2/17] Current/Best: 40.70/ 45.50 GFLOPS | Progress: (736/1000) | 545.46 s Done. -# [Task 3/17] Current/Best: 38.83/ 42.35 GFLOPS | Progress: (992/1000) | 1549.85 s Done. -# [Task 4/17] Current/Best: 23.31/ 31.02 GFLOPS | Progress: (640/1000) | 1059.31 s Done. -# [Task 5/17] Current/Best: 0.06/ 2.34 GFLOPS | Progress: (544/1000) | 305.45 s Done. -# [Task 6/17] Current/Best: 10.97/ 17.20 GFLOPS | Progress: (992/1000) | 1050.00 s Done. -# [Task 7/17] Current/Best: 8.98/ 10.94 GFLOPS | Progress: (928/1000) | 421.36 s Done. -# [Task 8/17] Current/Best: 4.48/ 14.86 GFLOPS | Progress: (704/1000) | 582.60 s Done. -# [Task 9/17] Current/Best: 10.30/ 25.99 GFLOPS | Progress: (864/1000) | 899.85 s Done. -# [Task 10/17] Current/Best: 11.73/ 12.52 GFLOPS | Progress: (608/1000) | 304.85 s Done. -# [Task 11/17] Current/Best: 15.26/ 18.68 GFLOPS | Progress: (800/1000) | 747.52 s Done. -# [Task 12/17] Current/Best: 17.48/ 26.71 GFLOPS | Progress: (1000/1000) | 1166.40 s Done. -# [Task 13/17] Current/Best: 0.96/ 11.43 GFLOPS | Progress: (960/1000) | 611.65 s Done. -# [Task 14/17] Current/Best: 17.88/ 20.22 GFLOPS | Progress: (672/1000) | 670.29 s Done. -# [Task 15/17] Current/Best: 11.62/ 13.98 GFLOPS | Progress: (736/1000) | 449.25 s Done. -# [Task 16/17] Current/Best: 19.90/ 23.83 GFLOPS | Progress: (608/1000) | 708.64 s Done. -# [Task 17/17] Current/Best: 17.98/ 22.75 GFLOPS | Progress: (736/1000) | 1122.60 s Done. -# Compile... -# Upload... 
-# Evaluate inference time cost... -# Mean inference time (std dev): 128.05 ms (7.74 ms) -# - -###################################################################### -# -# .. note:: **Experiencing Difficulties?** -# -# The auto tuning module is error-prone. If you always see " 0.00/ 0.00 GFLOPS", -# then there must be something wrong. -# -# First, make sure you set the correct configuration of your device. -# Then, you can print debug information by adding these lines in the beginning -# of the script. It will print every measurement result, where you can find useful -# error messages. -# -# .. code-block:: python -# -# import logging -# logging.getLogger('autotvm').setLevel(logging.DEBUG) -# -# Finally, always feel free to ask our community for help on https://discuss.tvm.apache.org diff --git a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py b/gallery/how_to/tune_with_autotvm/tune_relay_x86.py deleted file mode 100644 index b56ec0ad0e2f..000000000000 --- a/gallery/how_to/tune_with_autotvm/tune_relay_x86.py +++ /dev/null @@ -1,310 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tune_relay_x86: - -Auto-tuning a Convolutional Network for x86 CPU -=============================================== -**Author**: `Yao Wang `_, `Eddie Yan `_ - -This is a tutorial about how to tune convolution neural network -for x86 CPU. - -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. -""" - -import os -import numpy as np - -import tvm -from tvm import relay, autotvm -from tvm.relay import testing -from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner -from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner -import tvm.contrib.graph_executor as runtime - -################################################################# -# Define network -# -------------- -# First we need to define the network in relay frontend API. -# We can either load some pre-defined network from :code:`relay.testing` -# or building :any:`relay.testing.resnet` with relay. -# We can also load models from MXNet, ONNX and TensorFlow. -# -# In this tutorial, we choose resnet-18 as tuning example. 
- - -def get_network(name, batch_size): - """Get the symbol definition and random weight of a network""" - input_shape = (batch_size, 3, 224, 224) - output_shape = (batch_size, 1000) - - if "resnet" in name: - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.resnet.get_workload( - num_layers=n_layer, batch_size=batch_size, dtype=dtype - ) - elif "vgg" in name: - n_layer = int(name.split("-")[1]) - mod, params = relay.testing.vgg.get_workload( - num_layers=n_layer, batch_size=batch_size, dtype=dtype - ) - elif name == "mobilenet": - mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, dtype=dtype) - elif name == "squeezenet_v1.1": - mod, params = relay.testing.squeezenet.get_workload( - batch_size=batch_size, version="1.1", dtype=dtype - ) - elif name == "inception_v3": - input_shape = (batch_size, 3, 299, 299) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - else: - raise ValueError("Unsupported network: " + name) - - return mod, params, input_shape, output_shape - - -# Replace "llvm" with the correct target of your CPU. -# For example, for AWS EC2 c5 instance with Intel Xeon -# Platinum 8000 series, the target should be "llvm -mcpu=skylake-avx512". -# For AWS EC2 c4 instance with Intel Xeon E5-2666 v3, it should be -# "llvm -mcpu=core-avx2". -target = "llvm" - -batch_size = 1 -dtype = "float32" -model_name = "resnet-18" -log_file = "%s.log" % model_name -graph_opt_sch_file = "%s_graph_opt.log" % model_name - -# Set the input name of the graph -# For ONNX models, it is typically "0". -input_name = "data" - -# Set number of threads used for tuning based on the number of -# physical CPU cores on your machine. -num_threads = 1 -os.environ["TVM_NUM_THREADS"] = str(num_threads) - - -################################################################# -# Configure tensor tuning settings and create tasks -# ------------------------------------------------- -# To get better kernel execution performance on x86 CPU, -# we need to change data layout of convolution kernel from -# "NCHW" to "NCHWc". To deal with this situation, we define -# conv2d_NCHWc operator in topi. We will tune this operator -# instead of plain conv2d. -# -# We will use local mode for tuning configuration. RPC tracker -# mode can be setup similarly to the approach in -# :ref:`tune_relay_arm` tutorial. -# -# To perform a precise measurement, we should repeat the measurement several -# times and use the average of results. In addition, we need to flush the cache -# for the weight tensors between repeated measurements. This can make the measured -# latency of one operator closer to its actual latency during end-to-end inference. - -tuning_option = { - "log_filename": log_file, - "tuner": "random", - "early_stopping": None, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(), - runner=autotvm.LocalRunner( - number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True - ), - ), -} - - -# You can skip the implementation of this function for this tutorial. 
-def tune_kernels( - tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="tuning.log" -): - - for i, task in enumerate(tasks): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - - # create tuner - if tuner == "xgb": - tuner_obj = XGBTuner(task, loss_type="reg") - elif tuner == "xgb_knob": - tuner_obj = XGBTuner(task, loss_type="reg", feature_type="knob") - elif tuner == "xgb_itervar": - tuner_obj = XGBTuner(task, loss_type="reg", feature_type="itervar") - elif tuner == "xgb_curve": - tuner_obj = XGBTuner(task, loss_type="reg", feature_type="curve") - elif tuner == "xgb_rank": - tuner_obj = XGBTuner(task, loss_type="rank") - elif tuner == "xgb_rank_knob": - tuner_obj = XGBTuner(task, loss_type="rank", feature_type="knob") - elif tuner == "xgb_rank_itervar": - tuner_obj = XGBTuner(task, loss_type="rank", feature_type="itervar") - elif tuner == "xgb_rank_curve": - tuner_obj = XGBTuner(task, loss_type="rank", feature_type="curve") - elif tuner == "xgb_rank_binary": - tuner_obj = XGBTuner(task, loss_type="rank-binary") - elif tuner == "xgb_rank_binary_knob": - tuner_obj = XGBTuner(task, loss_type="rank-binary", feature_type="knob") - elif tuner == "xgb_rank_binary_itervar": - tuner_obj = XGBTuner(task, loss_type="rank-binary", feature_type="itervar") - elif tuner == "xgb_rank_binary_curve": - tuner_obj = XGBTuner(task, loss_type="rank-binary", feature_type="curve") - elif tuner == "ga": - tuner_obj = GATuner(task, pop_size=50) - elif tuner == "random": - tuner_obj = RandomTuner(task) - elif tuner == "gridsearch": - tuner_obj = GridSearchTuner(task) - else: - raise ValueError("Invalid tuner: " + tuner) - - # do tuning - n_trial = len(task.config_space) - tuner_obj.tune( - n_trial=n_trial, - early_stopping=early_stopping, - measure_option=measure_option, - callbacks=[ - autotvm.callback.progress_bar(n_trial, prefix=prefix), - autotvm.callback.log_to_file(log_filename), - ], - ) - - -# Use graph tuner to achieve graph level optimal schedules -# Set use_DP=False if it takes too long to finish. -def tune_graph(graph, dshape, records, opt_sch_file, use_DP=True): - target_op = [ - relay.op.get("nn.conv2d"), - ] - Tuner = DPTuner if use_DP else PBQPTuner - executor = Tuner(graph, {input_name: dshape}, records, target_op, target) - executor.benchmark_layout_transform(min_exec_num=2000) - executor.run() - executor.write_opt_sch2record_file(opt_sch_file) - - -######################################################################## -# Finally, we launch tuning jobs and evaluate the end-to-end performance. 
- - -def evaluate_performance(lib, data_shape): - # upload parameters to device - dev = tvm.cpu() - data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype)) - module = runtime.GraphModule(lib["default"](dev)) - module.set_input(input_name, data_tvm) - - # evaluate - print("Evaluate inference time cost...") - print(module.benchmark(dev, number=100, repeat=3)) - - -def tune_and_evaluate(tuning_opt): - # extract workloads from relay program - print("Extract tasks...") - mod, params, data_shape, out_shape = get_network(model_name, batch_size) - tasks = autotvm.task.extract_from_program( - mod["main"], target=target, params=params, ops=(relay.op.get("nn.conv2d"),) - ) - - # run tuning tasks - tune_kernels(tasks, **tuning_opt) - tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file) - - # compile kernels in default mode - print("Evaluation of the network compiled in 'default' mode without auto tune:") - with tvm.transform.PassContext(opt_level=3): - print("Compile...") - lib = relay.build(mod, target=target, params=params) - evaluate_performance(lib, data_shape) - - # compile kernels in kernel tuned only mode - print("\nEvaluation of the network been tuned on kernel level:") - with autotvm.apply_history_best(log_file): - print("Compile...") - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - evaluate_performance(lib, data_shape) - - # compile kernels with graph-level best records - print("\nEvaluation of the network been tuned on graph level:") - with autotvm.apply_graph_best(graph_opt_sch_file): - print("Compile...") - with tvm.transform.PassContext(opt_level=3): - lib = relay.build_module.build(mod, target=target, params=params) - evaluate_performance(lib, data_shape) - - -# We do not run the tuning in our webpage server since it takes too long. -# Uncomment the following line to run it by yourself. - -# tune_and_evaluate(tuning_option) - -###################################################################### -# Sample Output -# ------------- -# The tuning needs to compile many programs and extract feature from them. -# So a high performance CPU is recommended. -# One sample output is listed below. -# -# .. code-block:: bash -# -# Extract tasks... -# Tuning... -# [Task 1/12] Current/Best: 598.05/2497.63 GFLOPS | Progress: (252/252) | 1357.95 s Done. -# [Task 2/12] Current/Best: 522.63/2279.24 GFLOPS | Progress: (784/784) | 3989.60 s Done. -# [Task 3/12] Current/Best: 447.33/1927.69 GFLOPS | Progress: (784/784) | 3869.14 s Done. -# [Task 4/12] Current/Best: 481.11/1912.34 GFLOPS | Progress: (672/672) | 3274.25 s Done. -# [Task 5/12] Current/Best: 414.09/1598.45 GFLOPS | Progress: (672/672) | 2720.78 s Done. -# [Task 6/12] Current/Best: 508.96/2273.20 GFLOPS | Progress: (768/768) | 3718.75 s Done. -# [Task 7/12] Current/Best: 469.14/1955.79 GFLOPS | Progress: (576/576) | 2665.67 s Done. -# [Task 8/12] Current/Best: 230.91/1658.97 GFLOPS | Progress: (576/576) | 2435.01 s Done. -# [Task 9/12] Current/Best: 487.75/2295.19 GFLOPS | Progress: (648/648) | 3009.95 s Done. -# [Task 10/12] Current/Best: 182.33/1734.45 GFLOPS | Progress: (360/360) | 1755.06 s Done. -# [Task 11/12] Current/Best: 372.18/1745.15 GFLOPS | Progress: (360/360) | 1684.50 s Done. -# [Task 12/12] Current/Best: 215.34/2271.11 GFLOPS | Progress: (400/400) | 2128.74 s Done. -# INFO Start to benchmark layout transformation... -# INFO Benchmarking layout transformation successful. -# INFO Start to run dynamic programming algorithm... 
-# INFO Start forward pass... -# INFO Finished forward pass. -# INFO Start backward pass... -# INFO Finished backward pass... -# INFO Finished DPExecutor run. -# INFO Writing optimal schedules to resnet-18_graph_opt.log successfully. -# -# Evaluation of the network compiled in 'default' mode without auto tune: -# Compile... -# Evaluate inference time cost... -# Mean inference time (std dev): 4.5 ms (0.03 ms) -# -# Evaluation of the network been tuned on kernel level: -# Compile... -# Evaluate inference time cost... -# Mean inference time (std dev): 3.2 ms (0.03 ms) -# -# Evaluation of the network been tuned on graph level: -# Compile... -# Config for target=llvm -keys=cpu, workload=('dense_nopack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression. -# Config for target=llvm -keys=cpu, workload=('dense_pack.x86', ('TENSOR', (1, 512), 'float32'), ('TENSOR', (1000, 512), 'float32'), None, 'float32') is missing in ApplyGraphBest context. A fallback configuration is used, which may bring great performance regression. -# Evaluate inference time cost... -# Mean inference time (std dev): 3.16 ms (0.03 ms) diff --git a/gallery/how_to/work_with_msc/_resnet.py b/gallery/how_to/work_with_msc/_resnet.py deleted file mode 100644 index d05172337638..000000000000 --- a/gallery/how_to/work_with_msc/_resnet.py +++ /dev/null @@ -1,350 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -# build resnet for cifar10, debug use only -# from https://github.com/huyvnphan/PyTorch_CIFAR10/blob/master/cifar10_models/resnet.py - -import os -import requests -from tqdm import tqdm -import zipfile - -import torch -import torch.nn as nn - -__all__ = [ - "ResNet", - "resnet18", - "resnet34", - "resnet50", -] - - -def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): - """3x3 convolution with padding""" - return nn.Conv2d( - in_planes, - out_planes, - kernel_size=3, - stride=stride, - padding=dilation, - groups=groups, - bias=False, - dilation=dilation, - ) - - -def conv1x1(in_planes, out_planes, stride=1): - """1x1 convolution""" - return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) - - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__( - self, - inplanes, - planes, - stride=1, - downsample=None, - groups=1, - base_width=64, - dilation=1, - norm_layer=None, - ): - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if groups != 1 or base_width != 64: - raise ValueError("BasicBlock only supports groups=1 and base_width=64") - if dilation > 1: - raise NotImplementedError("Dilation > 1 not supported in BasicBlock") - # Both self.conv1 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv3x3(inplanes, planes, stride) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU(inplace=True) - self.conv2 = conv3x3(planes, planes) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__( - self, - inplanes, - planes, - stride=1, - downsample=None, - groups=1, - base_width=64, - dilation=1, - norm_layer=None, - ): - super(Bottleneck, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - width = int(planes * (base_width / 64.0)) * groups - # Both self.conv2 and self.downsample layers downsample the input when stride != 1 - self.conv1 = conv1x1(inplanes, width) - self.bn1 = norm_layer(width) - self.conv2 = conv3x3(width, width, stride, groups, dilation) - self.bn2 = norm_layer(width) - self.conv3 = conv1x1(width, planes * self.expansion) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU(inplace=True) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class ResNet(nn.Module): - def __init__( - self, - block, - layers, - num_classes=10, - zero_init_residual=False, - groups=1, - width_per_group=64, - replace_stride_with_dilation=None, - norm_layer=None, - ): - super(ResNet, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2d - self._norm_layer = norm_layer - - self.inplanes = 64 - self.dilation = 1 - if replace_stride_with_dilation is None: - # each element in the tuple indicates if we should replace - # the 2x2 stride with a dilated convolution instead - replace_stride_with_dilation = 
[False, False, False] - if len(replace_stride_with_dilation) != 3: - raise ValueError( - "replace_stride_with_dilation should be None " - "or a 3-element tuple, got {}".format(replace_stride_with_dilation) - ) - self.groups = groups - self.base_width = width_per_group - - # CIFAR10: kernel_size 7 -> 3, stride 2 -> 1, padding 3->1 - self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False) - # END - - self.bn1 = norm_layer(self.inplanes) - self.relu = nn.ReLU(inplace=True) - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer( - block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0] - ) - self.layer3 = self._make_layer( - block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1] - ) - self.layer4 = self._make_layer( - block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2] - ) - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(512 * block.expansion, num_classes) - - for m in self.modules(): - if isinstance(m, nn.Conv2d): - nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu") - elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): - nn.init.constant_(m.weight, 1) - nn.init.constant_(m.bias, 0) - - # Zero-initialize the last BN in each residual branch, - # so that the residual branch starts with zeros, and each residual block behaves like an identity. - # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 - if zero_init_residual: - for m in self.modules(): - if isinstance(m, Bottleneck): - nn.init.constant_(m.bn3.weight, 0) - elif isinstance(m, BasicBlock): - nn.init.constant_(m.bn2.weight, 0) - - def _make_layer(self, block, planes, blocks, stride=1, dilate=False): - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - conv1x1(self.inplanes, planes * block.expansion, stride), - norm_layer(planes * block.expansion), - ) - - layers = [] - layers.append( - block( - self.inplanes, - planes, - stride, - downsample, - self.groups, - self.base_width, - previous_dilation, - norm_layer, - ) - ) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append( - block( - self.inplanes, - planes, - groups=self.groups, - base_width=self.base_width, - dilation=self.dilation, - norm_layer=norm_layer, - ) - ) - - return nn.Sequential(*layers) - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - - x = self.layer1(x) - x = self.layer2(x) - x = self.layer3(x) - x = self.layer4(x) - - x = self.avgpool(x) - x = x.reshape(x.size(0), -1) - x = self.fc(x) - - return x - - -def _resnet(arch, block, layers, pretrained, progress, device, **kwargs): - model = ResNet(block, layers, **kwargs) - if pretrained: - if os.path.isdir(pretrained): - state_dict = torch.load(pretrained + "/" + arch + ".pt", map_location=device) - else: - script_dir = os.path.dirname(__file__) - state_dict = torch.load( - script_dir + "/state_dicts/" + arch + ".pt", map_location=device - ) - model.load_state_dict(state_dict) - return model - - -def resnet18(pretrained=False, progress=True, device="cpu", **kwargs): - """Constructs a ResNet-18 model. 
- Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet("resnet18", BasicBlock, [2, 2, 2, 2], pretrained, progress, device, **kwargs) - - -def resnet34(pretrained=False, progress=True, device="cpu", **kwargs): - """Constructs a ResNet-34 model. - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet("resnet34", BasicBlock, [3, 4, 6, 3], pretrained, progress, device, **kwargs) - - -def resnet50(pretrained=False, progress=True, device="cpu", **kwargs): - """Constructs a ResNet-50 model. - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - return _resnet("resnet50", Bottleneck, [3, 4, 6, 3], pretrained, progress, device, **kwargs) - - -def download_weights(): - url = "https://rutgers.box.com/shared/static/gkw08ecs797j2et1ksmbg1w5t3idf5r5.zip" - - # Streaming, so we can iterate over the response. - r = requests.get(url, stream=True) - - # Total size in Mebibyte - total_size = int(r.headers.get("content-length", 0)) - block_size = 2**20 # Mebibyte - t = tqdm(total=total_size, unit="MiB", unit_scale=True) - - with open("state_dicts.zip", "wb") as f: - for data in r.iter_content(block_size): - t.update(len(data)) - f.write(data) - t.close() - - if total_size != 0 and t.n != total_size: - raise Exception("Error, something went wrong") - - print("Download successful. Unzipping file...") - path_to_zip_file = os.path.join(os.getcwd(), "state_dicts.zip") - directory_to_extract_to = os.path.join(os.getcwd(), "cifar10_models") - with zipfile.ZipFile(path_to_zip_file, "r") as zip_ref: - zip_ref.extractall(directory_to_extract_to) - print("Unzip file successful!") diff --git a/gallery/how_to/work_with_msc/using_tools.py b/gallery/how_to/work_with_msc/using_tools.py deleted file mode 100644 index c8187d218d9b..000000000000 --- a/gallery/how_to/work_with_msc/using_tools.py +++ /dev/null @@ -1,140 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Wrap pytorch model with quantizer. -This example shows how to run PTQ, QAT, PTQ with distill... 
-Reference for MSC: -https://discuss.tvm.apache.org/t/rfc-unity-msc-introduction-to-multi-system-compiler/15251/5 - -This example use resnet50 from https://github.com/huyvnphan/PyTorch_CIFAR10/tree/master, -please download pt file and copy to args.checkpoint before run example -""" - -import argparse -import torch -import torch.optim as optim - -from tvm.contrib.msc.pipeline import TorchWrapper -from tvm.contrib.msc.core.tools import ToolType -from tvm.contrib.msc.core.utils.message import MSCStage -from _resnet import resnet50 -from utils import * - -parser = argparse.ArgumentParser(description="MSC train && eval example") -parser.add_argument( - "--dataset", - type=str, - default="/tmp/msc_dataset", - help="The folder saving training and testing datas", -) -parser.add_argument( - "--checkpoint", - type=str, - default="/tmp/msc_models", - help="The folder saving training and testing datas", -) -parser.add_argument("--compile_type", type=str, default="tvm", help="The compile type of model") -parser.add_argument("--prune", action="store_true", help="Whether to use pruner") -parser.add_argument("--quantize", action="store_true", help="Whether to use quantizer") -parser.add_argument("--distill", action="store_true", help="Whether to use distiller for tool") -parser.add_argument("--gym", action="store_true", help="Whether to use gym for tool") -parser.add_argument("--test_batch", type=int, default=1, help="The batch size for test") -parser.add_argument("--test_iter", type=int, default=100, help="The iter for test") -parser.add_argument("--calibrate_iter", type=int, default=100, help="The iter for calibration") -parser.add_argument("--train_batch", type=int, default=32, help="The batch size for train") -parser.add_argument("--train_iter", type=int, default=100, help="The iter for train") -parser.add_argument("--train_epoch", type=int, default=5, help="The epoch for train") -parser.add_argument( - "--verbose", type=str, default="info", help="The verbose level, info|debug:1,2,3|critical" -) -parser.add_argument("--dynamic", action="store_true", help="Whether to use dynamic wrapper") -args = parser.parse_args() - - -def get_config(calib_loader, train_loader): - tools, dataset = [], {MSCStage.PREPARE: {"loader": calib_loader}} - if args.prune: - config = {"gym_configs": ["default"]} if args.gym else "default" - tools.append((ToolType.PRUNER, config)) - if args.quantize: - config = {"gym_configs": ["default"]} if args.gym else "default" - tools.append((ToolType.QUANTIZER, config)) - if args.distill: - config = { - "options": { - "optimizer": "adam", - "opt_config": {"lr": 0.00000001, "weight_decay": 0.08}, - } - } - tools.append((ToolType.DISTILLER, config)) - dataset[MSCStage.DISTILL] = {"loader": train_loader} - return TorchWrapper.create_config( - inputs=[("input", [args.test_batch, 3, 32, 32], "float32")], - outputs=["output"], - compile_type=args.compile_type, - dataset=dataset, - tools=tools, - verbose=args.verbose, - dynamic=args.dynamic, - ) - - -if __name__ == "__main__": - trainloader, testloader = get_dataloaders(args.dataset, args.train_batch, args.test_batch) - - def _get_calib_datas(): - for i, (inputs, _) in enumerate(testloader, 0): - if i >= args.calibrate_iter > 0: - break - yield inputs if args.dynamic else {"input": inputs} - - def _get_train_datas(): - for i, (inputs, _) in enumerate(trainloader, 0): - if i >= args.train_iter > 0: - break - yield inputs if args.dynamic else {"input": inputs} - - model = resnet50(pretrained=args.checkpoint) - if torch.cuda.is_available(): - model 
= model.to(torch.device("cuda:0")) - - acc = eval_model(model, testloader, max_iter=args.test_iter) - print("Baseline acc: " + str(acc)) - - model = TorchWrapper(model, get_config(_get_calib_datas, _get_train_datas)) - - # optimize the model with tool - model.optimize() - acc = eval_model(model, testloader, max_iter=args.test_iter) - print("Optimized acc: " + str(acc)) - - # train the model with tool - optimizer = optim.Adam(model.parameters(), lr=0.0000001, weight_decay=0.08) - for ep in range(args.train_epoch): - train_model(model, trainloader, optimizer, max_iter=args.train_iter) - acc = eval_model(model, testloader, max_iter=args.test_iter) - print("Train[{}] acc: {}".format(ep, acc)) - - # compile the model - model.compile() - acc = eval_model(model, testloader, max_iter=args.test_iter) - print("Compiled acc: " + str(acc)) - - # export the model - path = model.export() - print("Export model to " + str(path)) diff --git a/gallery/how_to/work_with_msc/utils.py b/gallery/how_to/work_with_msc/utils.py deleted file mode 100644 index 3ff20afec6d3..000000000000 --- a/gallery/how_to/work_with_msc/utils.py +++ /dev/null @@ -1,112 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -""" Utils of using msc examples """ - -import numpy as np - -import torch -from torch import nn -import torchvision -import torchvision.transforms as transforms - - -def get_dataloaders(path, train_batch=32, test_batch=1, dataset="cifar10"): - """Get the data loaders for torch process""" - - if dataset == "cifar10": - mean = (0.4914, 0.4822, 0.4465) - std = (0.2471, 0.2435, 0.2616) - train_transform = transforms.Compose( - [ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize(mean, std), - ] - ) - trainset = torchvision.datasets.CIFAR10( - root=path, train=True, download=True, transform=train_transform - ) - test_transform = transforms.Compose( - [ - transforms.ToTensor(), - transforms.Normalize(mean, std), - ] - ) - testset = torchvision.datasets.CIFAR10( - root=path, train=False, download=True, transform=test_transform - ) - trainloader = torch.utils.data.DataLoader( - trainset, batch_size=train_batch, shuffle=True, num_workers=2 - ) - testloader = torch.utils.data.DataLoader( - testset, batch_size=test_batch, shuffle=False, num_workers=2 - ) - return trainloader, testloader - raise Exception("Unexpected dataset " + str(dataset)) - - -def eval_model(model, dataloader, max_iter=-1, log_step=100): - """Evaluate the model""" - - model.eval() - device = next(model.parameters()).device - num_correct, num_datas = 0, 0 - for i, (inputs, labels) in enumerate(dataloader, 0): - with torch.no_grad(): - outputs = model(inputs.to(device)) - cls_idices = torch.argmax(outputs, axis=1) - labels = labels.to(device) - num_datas += len(cls_idices) - num_correct += torch.where(cls_idices == labels, 1, 0).sum() - if num_datas > 0 and num_datas % log_step == 0: - print("[{}/{}] Torch eval acc: {}".format(i, len(dataloader), num_correct / num_datas)) - if max_iter > 0 and num_datas >= max_iter: - break - acc = num_correct / num_datas - return acc.detach().cpu().numpy().tolist() - - -def train_model(model, dataloader, optimizer, max_iter=-1, log_step=100): - """Train the model""" - - model.train() - device = next(model.parameters()).device - num_correct, num_datas = 0, 0 - criterion = nn.CrossEntropyLoss() - running_loss = 0.0 - for i, (inputs, labels) in enumerate(dataloader, 0): - optimizer.zero_grad() - outputs = model(inputs.to(device)) - cls_idices = torch.argmax(outputs, axis=1) - labels = labels.to(device) - num_datas += len(cls_idices) - num_correct += torch.where(cls_idices == labels, 1, 0).sum() - loss = criterion(outputs, labels) - loss.backward() - optimizer.step() - # gather loss - running_loss += loss.item() - if num_datas > 0 and num_datas % log_step == 0: - print( - "[{}/{}] Torch train loss: {}, acc {}".format( - i, len(dataloader), running_loss / (i + 1), num_correct / num_datas - ) - ) - if max_iter > 0 and num_datas >= max_iter: - break diff --git a/gallery/how_to/work_with_pytorch/using_as_torch.py b/gallery/how_to/work_with_pytorch/using_as_torch.py deleted file mode 100644 index 59c7f88845d9..000000000000 --- a/gallery/how_to/work_with_pytorch/using_as_torch.py +++ /dev/null @@ -1,161 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Wrap Your TVMScript as PyTorch Module -===================================== -**Author**: -`Yaoda Zhou `_ - -This article is a tutorial on wrapping the TVMScript code as the PyTorch module. -Using the decorator `as_torch`, users can wrap TVMScript code into a PyTorch nn.Module naturally. -To follow the tutorial, PyTorch should be installed: - -.. code-block:: bash - - %%shell - pip install torch - -""" - - -# Import PyTorch, as well as necessary libraries -import torch -import torch.nn.functional as F -import torch.utils.benchmark as benchmark - -import tvm -from tvm.contrib.torch import as_torch -from tvm.script import tir as T - -###################################################################### -# Write your own PyTorch operator by TVMScript -# -------------------------------------------- -# PyTorch is a very popular machine learning framework which contains -# optimized implementations of most commonly used operators. -# Nevertheless, sometimes you might want to write your own operators in PyTorch. -# In that case, the performance of such custom operators might not be satisfactory for your needs. -# -# For example, suppose that we are going to define a 1-d depthwise convolution operator. -# Assume the number of in_channel and out_channel are both 70, -# the width is 80 and the kernel size is 20, -# then the 1-d depthwise conv could be written in PyTorch in one line: - -in_channel = 70 -out_channel = 70 -width = 80 -kernel_size = 20 - - -def torch_depthwise(inputs, filters): - return F.conv1d(inputs, filters.view(out_channel, 1, kernel_size), groups=out_channel) - - -# We can run this function as: - -inputs = torch.randn(in_channel, width) -filters = torch.randn(out_channel, kernel_size) -ret_torch = torch_depthwise(inputs, filters) - - -# The `torch_depthwise` function, in a plain Python code, could be written as: - - -def vanilla_depthwise(input, weight): - ret = torch.zeros(out_channel, width - kernel_size + 1) - for j in range(out_channel): - for i in range(width - kernel_size + 1): - for k in range(kernel_size): - ret[j, i] += weight[j, k] * input[j, i + k] - return ret - - -# Then, we plan to optimize the `depthwise` function by leveraging the power of TVM. -# TVM community proposes an embedded Domain Specific Language in Python called TVMScript, -# which serves as the high-level frontend for TVM's Tensor IR. -# The depthwise 1D convolution code above can be translated to TVMScript as follows. -# We provide an `as_torch` decorator, which converts the TVMScript code to PyTorch's nn.Module automatically. - - -@as_torch -@T.prim_func -def tvm_depthwise( - A: T.Buffer((70, 80), "float32"), - B: T.Buffer((70, 20), "float32"), - C: T.Buffer((70, 61), "float32"), -) -> None: - for j, i, k in T.grid(70, 61, 20): - with T.block(): - vi, vj, vk = T.axis.remap("SSR", [i, j, k]) - with T.init(): - C[vj, vi] = T.float32(0) - C[vj, vi] += B[vj, vk] * A[vj, vi + vk] - - -# We can build the TVMScript code by calling the `tune` method in default setting. -# Without providing extra information, the model will be tuned for CPU. 
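# Before tuning, it is worth a quick sanity check that the plain-Python
# reference and the PyTorch one-liner defined above agree. This is a small
# added check reusing the tensors created earlier, not a required step.

ret_vanilla = vanilla_depthwise(inputs, filters)
assert torch.allclose(ret_torch, ret_vanilla, atol=1e-5, rtol=1e-5)

# With the references agreeing, we proceed to tune the TVMScript version.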
- -tvm_depthwise.tune() - -# We can print out the tuned TVMScript code to see how the program is transformed, as - -print(tvm_depthwise.script()) - -# We can verify that the two outputs are the same: - -ret_tvm = torch.zeros(out_channel, width - kernel_size + 1) -tvm_depthwise(inputs, filters, ret_tvm) - -testing.assert_allclose(ret_torch.cpu().numpy(), ret_tvm.cpu().numpy(), atol=1e-5, rtol=1e-5) - - -###################################################################### -# Benchmark -# --------- - -results = [] -for i in range(5): - inputs = torch.randn(out_channel, width) - filters = torch.randn(out_channel, kernel_size) - res = torch.zeros(out_channel, width - kernel_size + 1) - sub_label = f"[test {i}]" - results.append( - benchmark.Timer( - stmt="tvm_depthwise(inputs, filters, res)", - setup="from __main__ import tvm_depthwise", - globals={"inputs": inputs, "filters": filters, "res": res}, - sub_label=sub_label, - description="TVMScript", - ).blocked_autorange() - ) - results.append( - benchmark.Timer( - stmt="torch_depthwise(inputs, filters)", - setup="from __main__ import torch_depthwise", - globals={ - "inputs": inputs, - "filters": filters, - }, - sub_label=sub_label, - description="PyTorch", - ).blocked_autorange() - ) -compare = benchmark.Compare(results) -compare.print() - -# In author's environment, the average inference time of `tvm_depthwise` is 120.0 us, -# while the average inference time of `torch_depthwise` is 196.0 us (PyTorch version is 1.11.0), -# showing the speedup of around 38%. diff --git a/gallery/how_to/work_with_pytorch/using_optimized_torch.py b/gallery/how_to/work_with_pytorch/using_optimized_torch.py deleted file mode 100644 index 0feafad7c3c3..000000000000 --- a/gallery/how_to/work_with_pytorch/using_optimized_torch.py +++ /dev/null @@ -1,153 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compile PyTorch Models -====================== -**Author**: -`Yaoda Zhou `_ - -This article is a tutorial to optimize PyTorch models by using decorator `optimize_torch`. -To follow this tutorial, PyTorch, as well as TorchVision, should be installed: - -.. 
code-block:: bash - - %%shell - pip install torch - pip install torchvision - -""" - -# Import PyTorch -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import torch -import torch.nn as nn -import torch.nn.functional as F - -# Import library for profiling -import torch.utils.benchmark as benchmark -from torchvision.models import resnet18 - -# Import `optimize_torch` function -from tvm.contrib.torch import optimize_torch -from tvm.meta_schedule import TuneConfig - -###################################################################### -# Define a simple module written by PyTorch -# ----------------------------------------- - - -class SimpleModel(nn.Module): - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(1, 20, 5) - self.conv2 = nn.Conv2d(20, 20, 5) - - def forward(self, x): - x = F.relu(self.conv1(x)) - return F.relu(self.conv2(x)) - - -###################################################################### -# Optimize SimpleModel by TVM MetaSchedule -# ---------------------------------------- -# We provide the `optimize_torch` function, which has the similar usage as `torch.jit.trace`. -# The PyTorch model to optimize, along with its example input, are provided by users. -# The PyTorch module will be tuned by TVM for the target hardware. -# Without providing extra information, the model will be tuned for CPU. - -simple_model = SimpleModel() -example_input = torch.randn(20, 1, 10, 10) -model_optimized_by_tvm = optimize_torch(simple_model, example_input) - -###################################################################### -# Save/Load module -# ---------------- -# We can save and load our tuned module like the standard `nn.Module`. - -# Let us run our tuned module. -ret1 = model_optimized_by_tvm(example_input) - -torch.save(model_optimized_by_tvm, "model_optimized.pt") -model_loaded = torch.load("model_optimized.pt") - -# We load the module and run it again. -ret2 = model_loaded(example_input) - -# We will show 2 results: -# (1) we can safely load and save model by showing the result of model -# after save and load operations is still the same as original one; -# (2) the model we optimize returns the same result as the original PyTorch model. - -ret3 = simple_model(example_input) -testing.assert_allclose(ret1.detach().numpy(), ret2.detach().numpy(), atol=1e-5, rtol=1e-5) -testing.assert_allclose(ret1.detach().numpy(), ret3.detach().numpy(), atol=1e-5, rtol=1e-5) - -###################################################################### -# Optimize resnet18 -# ----------------- -# In the following, we will show that our approach is able to -# accelerate common models, such as resnet18. - -# We will tune our model for the GPU. -target_cuda = "nvidia/geforce-rtx-3070" - -# For PyTorch users, the code could be written as usual, except for -# applying "optimize_torch" function on the resnet18 model. - -resnet18_tvm = optimize_torch( - resnet18().cuda().eval(), [torch.rand(1, 3, 224, 224).cuda()], target=target_cuda -) - -# TorchScript also provides a built-in "optimize_for_inference" function to accelerate the inference. 
-resnet18_torch = torch.jit.optimize_for_inference(torch.jit.script(resnet18().cuda().eval())) - - -###################################################################### -# Compare the performance between two approaches -# ---------------------------------------------- - -results = [] -for i in range(5): - test_input = torch.rand(1, 3, 224, 224).cuda() - sub_label = f"[test {i}]" - results.append( - benchmark.Timer( - stmt="resnet18_tvm(test_input)", - setup="from __main__ import resnet18_tvm", - globals={"test_input": test_input}, - sub_label=sub_label, - description="tuning by meta", - ).blocked_autorange() - ) - results.append( - benchmark.Timer( - stmt="resnet18_torch(test_input)", - setup="from __main__ import resnet18_torch", - globals={"test_input": test_input}, - sub_label=sub_label, - description="tuning by jit", - ).blocked_autorange() - ) - -compare = benchmark.Compare(results) -compare.print() - -# In author's environment, the average inference time of `resnet18_tvm` is 620.0 us, -# while the average inference time of `resnet18_torch` is 980.0 us (PyTorch version is 1.11.0), -# showing the speedup of around 38%. diff --git a/gallery/how_to/work_with_relay/README.txt b/gallery/how_to/work_with_relay/README.txt deleted file mode 100644 index e25e1782c03f..000000000000 --- a/gallery/how_to/work_with_relay/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Work With Relay ---------------- diff --git a/gallery/how_to/work_with_relay/build_gcn.py b/gallery/how_to/work_with_relay/build_gcn.py deleted file mode 100644 index a4bb5d58cd30..000000000000 --- a/gallery/how_to/work_with_relay/build_gcn.py +++ /dev/null @@ -1,344 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Building a Graph Convolutional Network -====================================== -**Author**: `Yulun Yao `_, \ - `Chien-Yu Lin `_ - -This article is an introductory tutorial to build a Graph Convolutional Network (GCN) with Relay. -In this tutorial, we will run our GCN on Cora dataset to demonstrate. -Cora dataset is a common benchmark for Graph Neural Networks (GNN) and frameworks that support GNN training and inference. -We directly load the dataset from DGL library to do the apples to apples comparison against DGL. - -.. code-block:: bash - - %%shell - pip install torch==2.0.0 - pip install dgl==v1.0.0 - -Please refer to DGL doc for installation at -https://docs.dgl.ai/install/index.html. - -Please refer to PyTorch guide for PyTorch installation at -https://pytorch.org/get-started/locally/. 
-""" - - -###################################################################### -# Define GCN in DGL with PyTorch backend -# -------------------------------------- -# -# DGL example: https://github.com/dmlc/dgl/tree/master/examples/pytorch/gcn -# This part reuses the code from the above example. -import torch -import torch.nn as nn -import torch.nn.functional as F -import dgl -import networkx as nx -from dgl.nn.pytorch import GraphConv - - -class GCN(nn.Module): - def __init__(self, g, n_infeat, n_hidden, n_classes, n_layers, activation): - super(GCN, self).__init__() - self.g = g - self.layers = nn.ModuleList() - self.layers.append(GraphConv(n_infeat, n_hidden, activation=activation)) - for i in range(n_layers - 1): - self.layers.append(GraphConv(n_hidden, n_hidden, activation=activation)) - self.layers.append(GraphConv(n_hidden, n_classes)) - - def forward(self, features): - h = features - for i, layer in enumerate(self.layers): - # handle api changes for differnt DGL version - if dgl.__version__ > "0.3": - h = layer(self.g, h) - else: - h = layer(h, self.g) - return h - - -###################################################################### -# Define the functions to load dataset and evaluate accuracy -# ---------------------------------------------------------- -# You may substitute this part with your own dataset, here we load data from DGL -from dgl.data import load_data -from collections import namedtuple - - -def evaluate(g, logits): - label = g.ndata["label"] - test_mask = g.ndata["test_mask"] - - pred = logits.argmax(axis=1) - acc = (torch.Tensor(pred[test_mask]) == label[test_mask]).float().mean() - - return acc - - -###################################################################### -# Load the data and set up model parameters -# ----------------------------------------- -""" -Parameters ----------- -num_layer: int - number of hidden layers - -num_hidden: int - number of the hidden units in the hidden layer - -infeat_dim: int - dimension of the input features - -num_classes: int - dimension of model output (Number of classes) -""" - -dataset = dgl.data.CoraGraphDataset() -dgl_g = dataset[0] -num_layers = 1 -num_hidden = 16 -features = dgl_g.ndata["feat"] -infeat_dim = features.shape[1] -num_classes = dataset.num_classes - -###################################################################### -# Set up the DGL-PyTorch model and get the golden results -# ------------------------------------------------------- -# -# The weights are trained with https://github.com/dmlc/dgl/blob/master/examples/pytorch/gcn/train.py -from tvm.contrib.download import download_testdata - -features = torch.FloatTensor(features) - -torch_model = GCN(dgl_g, infeat_dim, num_hidden, num_classes, num_layers, F.relu) - -# Download the pretrained weights -model_url = "https://homes.cs.washington.edu/~cyulin/media/gnn_model/gcn_cora.torch" -model_path = download_testdata(model_url, "gcn_cora.pickle", module="gcn_model") - -# Load the weights into the model -torch_model.load_state_dict(torch.load(model_path)) - - -###################################################################### -# Run the DGL model and test for accuracy -# --------------------------------------- -torch_model.eval() -with torch.no_grad(): - logits_torch = torch_model(features) -print("Print the first five outputs from DGL-PyTorch execution\n", logits_torch[:5]) - -acc = evaluate(dgl_g, logits_torch.numpy()) -print("Test accuracy of DGL results: {:.2%}".format(acc)) - - 
-###################################################################### -# Define Graph Convolution Layer in Relay -# --------------------------------------- -# To run GCN on TVM, we first need to implement Graph Convolution Layer. -# You may refer to https://github.com/dmlc/dgl/blob/master/python/dgl/nn/mxnet/conv/graphconv.py for a GraphConv Layer implemented in DGL with MXNet Backend -# -# The layer is defined with below operations, note that we apply two transposes to keep adjacency matrix on right hand side of sparse_dense operator, -# this method is temporary and will be updated in next few weeks when we have sparse matrix transpose and support for left sparse operator. -# -# .. math:: -# -# \mbox{GraphConv}(A, H, W) = A * H * W -# = ((H * W)^t * A^t)^t -# = ((W^t * H^t) * A^t)^t -from tvm import relay -from tvm.contrib import graph_executor -import tvm -from tvm import te - - -def GraphConv(layer_name, input_dim, output_dim, adj, input, norm=None, bias=True, activation=None): - """ - Parameters - ---------- - layer_name: str - Name of layer - - input_dim: int - Input dimension per node feature - - output_dim: int, - Output dimension per node feature - - adj: namedtuple, - Graph representation (Adjacency Matrix) in Sparse Format (`data`, `indices`, `indptr`), - where `data` has shape [num_nonzeros], indices` has shape [num_nonzeros], `indptr` has shape [num_nodes + 1] - - input: relay.Expr, - Input feature to current layer with shape [num_nodes, input_dim] - - norm: relay.Expr, - Norm passed to this layer to normalize features before and after Convolution. - - bias: bool - Set bias to True to add bias when doing GCN layer - - activation: , - Activation function applies to the output. e.g. relay.nn.{relu, sigmoid, log_softmax, softmax, leaky_relu} - - Returns - ---------- - output: tvm.relay.Expr - The Output Tensor for this layer [num_nodes, output_dim] - """ - if norm is not None: - input = relay.multiply(input, norm) - - weight = relay.var(layer_name + ".weight", shape=(input_dim, output_dim)) - weight_t = relay.transpose(weight) - dense = relay.nn.dense(weight_t, input) - output = relay.nn.sparse_dense(dense, adj) - output_t = relay.transpose(output) - if norm is not None: - output_t = relay.multiply(output_t, norm) - if bias is True: - _bias = relay.var(layer_name + ".bias", shape=(output_dim, 1)) - output_t = relay.nn.bias_add(output_t, _bias, axis=-1) - if activation is not None: - output_t = activation(output_t) - return output_t - - -###################################################################### -# Prepare the parameters needed in the GraphConv layers -# ----------------------------------------------------- -# -import numpy as np -import networkx as nx - - -def prepare_params(g): - params = {} - params["infeats"] = g.ndata["feat"].numpy().astype("float32") - - # Generate adjacency matrix - nx_graph = dgl.to_networkx(g) - adjacency = nx.to_scipy_sparse_array(nx_graph) - params["g_data"] = adjacency.data.astype("float32") - params["indices"] = adjacency.indices.astype("int32") - params["indptr"] = adjacency.indptr.astype("int32") - - # Normalization w.r.t. 
node degrees - degs = [g.in_degrees(i) for i in range(g.number_of_nodes())] - params["norm"] = np.power(degs, -0.5).astype("float32") - params["norm"] = params["norm"].reshape((params["norm"].shape[0], 1)) - - return params - - -params = prepare_params(dgl_g) - -# Check shape of features and the validity of adjacency matrix -assert len(params["infeats"].shape) == 2 -assert ( - params["g_data"] is not None and params["indices"] is not None and params["indptr"] is not None -) -assert params["infeats"].shape[0] == params["indptr"].shape[0] - 1 - -###################################################################### -# Put layers together -# ------------------- - -# Define input features, norms, adjacency matrix in Relay -infeats = relay.var("infeats", shape=features.shape) -norm = relay.Constant(tvm.nd.array(params["norm"])) -g_data = relay.Constant(tvm.nd.array(params["g_data"])) -indices = relay.Constant(tvm.nd.array(params["indices"])) -indptr = relay.Constant(tvm.nd.array(params["indptr"])) - -Adjacency = namedtuple("Adjacency", ["data", "indices", "indptr"]) -adj = Adjacency(g_data, indices, indptr) - -# Construct the 2-layer GCN -layers = [] -layers.append( - GraphConv( - layer_name="layers.0", - input_dim=infeat_dim, - output_dim=num_hidden, - adj=adj, - input=infeats, - norm=norm, - activation=relay.nn.relu, - ) -) -layers.append( - GraphConv( - layer_name="layers.1", - input_dim=num_hidden, - output_dim=num_classes, - adj=adj, - input=layers[-1], - norm=norm, - activation=None, - ) -) - -# Analyze free variables and generate Relay function -output = layers[-1] - -###################################################################### -# Compile and run with TVM -# ------------------------ -# -# Export the weights from PyTorch model to Python Dict -model_params = {} -for param_tensor in torch_model.state_dict(): - model_params[param_tensor] = torch_model.state_dict()[param_tensor].numpy() - -for i in range(num_layers + 1): - params["layers.%d.weight" % (i)] = model_params["layers.%d.weight" % (i)] - params["layers.%d.bias" % (i)] = model_params["layers.%d.bias" % (i)] - -# Set the TVM build target -target = "llvm" # Currently only support `llvm` as target - -func = relay.Function(relay.analysis.free_vars(output), output) -func = relay.build_module.bind_params_by_name(func, params) -mod = tvm.IRModule() -mod["main"] = func -# Build with Relay -with tvm.transform.PassContext(opt_level=0): # Currently only support opt_level=0 - lib = relay.build(mod, target, params=params) - -# Generate graph executor -dev = tvm.device(target, 0) -m = graph_executor.GraphModule(lib["default"](dev)) - -###################################################################### -# Run the TVM model, test for accuracy and verify with DGL -# -------------------------------------------------------- -m.run() -logits_tvm = m.get_output(0).numpy() -print("Print the first five outputs from TVM execution\n", logits_tvm[:5]) - -acc = evaluate(dgl_g, logits_tvm) -print("Test accuracy of TVM results: {:.2%}".format(acc)) - -import tvm.testing - -# Verify the results with the DGL model -tvm.testing.assert_allclose(logits_torch, logits_tvm, atol=1e-3) diff --git a/gallery/how_to/work_with_relay/using_external_lib.py b/gallery/how_to/work_with_relay/using_external_lib.py deleted file mode 100644 index 38f5b2d460ba..000000000000 --- a/gallery/how_to/work_with_relay/using_external_lib.py +++ /dev/null @@ -1,562 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Using External Libraries in Relay -================================= -**Author**: `Masahiro Masuda `_, `Truman Tian `_ - -This is a short tutorial on how to use external libraries such as cuDNN, or cuBLAS with Relay. - -Relay uses TVM internally to generate target specific code. For example, with cuda backend TVM generates cuda kernels for all layers in the user provided network. -But sometimes it is also helpful to incorporate external libraries developed by various vendors into Relay. -Luckily, TVM has a mechanism to transparently call into these libraries. -For Relay users, all we need to do is just to set a target string appropriately. - -Before we can use external libraries from Relay, your TVM needs to be built with libraries you want to use. -For example, to use cuDNN, USE_CUDNN option in `cmake/config.cmake` needs to be enabled, and cuDNN include and library directories need to be specified if necessary. - -To begin with, we import Relay and TVM. -""" - -import tvm -from tvm import te -import numpy as np -from tvm.contrib import graph_executor as runtime -from tvm import relay -from tvm.relay import testing -import tvm.testing - -###################################################################### -# Create a simple network -# ----------------------- -# Let's create a very simple network for demonstration. -# It consists of convolution, batch normalization, and ReLU activation. - -out_channels = 16 -batch_size = 1 - -data = relay.var("data", relay.TensorType((batch_size, 3, 224, 224), "float32")) -weight = relay.var("weight") -bn_gamma = relay.var("bn_gamma") -bn_beta = relay.var("bn_beta") -bn_mmean = relay.var("bn_mean") -bn_mvar = relay.var("bn_var") - -simple_net = relay.nn.conv2d( - data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1) -) -simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0] -simple_net = relay.nn.relu(simple_net) -simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net) - -data_shape = (batch_size, 3, 224, 224) -net, params = testing.create_workload(simple_net) - -###################################################################### -# Build and run with cuda backend -# ------------------------------- -# We build and run this network with cuda backend, as usual. -# By setting the logging level to DEBUG, the result of Relay graph compilation will be dumped as pseudo code. 
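# As a point of reference for what follows: once TVM has been built with
# ``set(USE_CUDNN ON)``, offloading an operator such as this convolution to
# cuDNN comes down to the target string alone. A minimal sketch, assuming a
# cuDNN-enabled build:
#
#     target = "cuda -libs=cudnn"
#     lib = relay.build_module.build(net, target, params=params)
#
# First, however, we build with the plain ``cuda`` target so the code TVM
# generates by itself can be inspected.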
-import logging - -logging.basicConfig(level=logging.DEBUG) # to dump TVM IR after fusion - -target = "cuda" -lib = relay.build_module.build(net, target, params=params) - -dev = tvm.device(target, 0) -data = np.random.uniform(-1, 1, size=data_shape).astype("float32") -module = runtime.GraphModule(lib["default"](dev)) -module.set_input("data", data) -module.run() -out_shape = (batch_size, out_channels, 224, 224) -out = module.get_output(0, tvm.nd.empty(out_shape)) -out_cuda = out.numpy() -###################################################################### -# The generated pseudo code should look something like below. -# Note how bias add, batch normalization, and ReLU activation are fused into the convolution kernel. -# TVM generates a single, fused kernel from this representation. -# -# .. code-block:: text -# -# produce tensor { -# // attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent = 1 -# // attr [compute] storage_scope = "local" -# allocate compute[float32 * 32] -# // attr [pad_temp.shared] storage_scope = "shared" -# allocate pad_temp.shared[float32 * 180] -# // attr [placeholder.shared] storage_scope = "shared" -# allocate placeholder.shared[float32 * 144] -# // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 28 -# // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 14 -# // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4 -# // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1 -# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16 -# produce compute { -# compute[0] = 0.000000f -# compute[1] = 0.000000f -# compute[2] = 0.000000f -# compute[3] = 0.000000f -# compute[4] = 0.000000f -# compute[5] = 0.000000f -# compute[6] = 0.000000f -# compute[7] = 0.000000f -# compute[8] = 0.000000f -# compute[9] = 0.000000f -# compute[10] = 0.000000f -# compute[11] = 0.000000f -# compute[12] = 0.000000f -# compute[13] = 0.000000f -# compute[14] = 0.000000f -# compute[15] = 0.000000f -# compute[16] = 0.000000f -# compute[17] = 0.000000f -# compute[18] = 0.000000f -# compute[19] = 0.000000f -# compute[20] = 0.000000f -# compute[21] = 0.000000f -# compute[22] = 0.000000f -# compute[23] = 0.000000f -# compute[24] = 0.000000f -# compute[25] = 0.000000f -# compute[26] = 0.000000f -# compute[27] = 0.000000f -# compute[28] = 0.000000f -# compute[29] = 0.000000f -# compute[30] = 0.000000f -# compute[31] = 0.000000f -# for (rc.outer, 0, 3) { -# produce pad_temp.shared { -# // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4 -# // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1 -# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16 -# if (likely(((threadIdx.z*15) < (60 - threadIdx.x)))) { -# if (likely((threadIdx.x < 15))) { -# pad_temp.shared[(((((threadIdx.z*15) + threadIdx.x)/60)*180) + ((((((threadIdx.z*15) + threadIdx.x)/6) % 10)*18) + ((((threadIdx.z*3) + threadIdx.x)*3) % 18)))] = tvm_if_then_else((((((1 - ((((threadIdx.z*15) + threadIdx.x)/6) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((threadIdx.z*15) + threadIdx.x)/6) % 10)))) && ((1 - ((((threadIdx.z*3) + threadIdx.x)*3) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - ((((threadIdx.z*3) + threadIdx.x)*3) % 18)))), placeholder[((((((((blockIdx.y*112) + blockIdx.x) + (rc.outer*3136)) + ((((threadIdx.z*15) + threadIdx.x)/60)*9408))*16) + ((((threadIdx.z*3) + threadIdx.x)*3) % 18)) + (((((threadIdx.z*15) + threadIdx.x)/6) % 10)*224)) + -225)], 0.000000f) -# pad_temp.shared[(((((((threadIdx.z*15) + threadIdx.x)*3) + 
1)/180)*180) + ((((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)*18) + (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)))) && ((1 - (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)))), placeholder[((((((((blockIdx.y*112) + blockIdx.x) + (rc.outer*3136)) + ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/180)*9408))*16) + (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)) + (((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)*224)) + -225)], 0.000000f) -# pad_temp.shared[(((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/180)*180) + ((((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)*18) + (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)))) && ((1 - (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)))), placeholder[((((((((blockIdx.y*112) + blockIdx.x) + (rc.outer*3136)) + ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/180)*9408))*16) + (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)) + (((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)*224)) + -225)], 0.000000f) -# } -# } -# } -# produce placeholder.shared { -# // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4 -# // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1 -# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16 -# if (likely(((threadIdx.z*4) < (16 - (threadIdx.x/3))))) { -# if (likely(((threadIdx.z*12) < (48 - threadIdx.x)))) { -# if (likely((threadIdx.x < 12))) { -# placeholder.shared[(((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3)] = placeholder[(((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3)] -# placeholder.shared[((((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3) + 1)] = placeholder[((((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3) + 1)] -# placeholder.shared[((((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3) + 2)] = placeholder[((((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3) + 2)] -# } -# } -# } -# } -# compute[0] = (compute[0] + (pad_temp.shared[threadIdx.x]*placeholder.shared[(threadIdx.z*36)])) -# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[(threadIdx.z*36)])) -# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[(threadIdx.z*36)])) -# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[(threadIdx.z*36)])) -# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[(threadIdx.z*36)])) -# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[(threadIdx.z*36)])) -# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[(threadIdx.z*36)])) -# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[(threadIdx.z*36)])) -# compute[8] = (compute[8] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 9)])) -# compute[9] = (compute[9] 
+ (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 9)])) -# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 9)])) -# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 9)])) -# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 9)])) -# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 9)])) -# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 9)])) -# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 9)])) -# compute[16] = (compute[16] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 18)])) -# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 18)])) -# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 18)])) -# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 18)])) -# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 18)])) -# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 18)])) -# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 18)])) -# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 18)])) -# compute[24] = (compute[24] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 27)])) -# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 27)])) -# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 27)])) -# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 27)])) -# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 27)])) -# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 27)])) -# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 27)])) -# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 27)])) -# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 1)])) -# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 1)])) -# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 1)])) -# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 1)])) -# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 1)])) -# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 1)])) -# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 1)])) -# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 1)])) -# compute[8] = (compute[8] + 
(pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 10)])) -# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 10)])) -# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 10)])) -# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 10)])) -# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 10)])) -# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 10)])) -# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 10)])) -# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 10)])) -# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 19)])) -# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 19)])) -# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 19)])) -# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 19)])) -# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 19)])) -# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 19)])) -# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 19)])) -# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 19)])) -# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 28)])) -# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 28)])) -# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 28)])) -# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 28)])) -# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 28)])) -# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 28)])) -# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 28)])) -# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 28)])) -# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 2)])) -# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 2)])) -# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 2)])) -# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 2)])) -# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 2)])) -# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 2)])) -# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 2)])) -# compute[7] = (compute[7] + 
(pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 2)])) -# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 11)])) -# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 11)])) -# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 11)])) -# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 11)])) -# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 11)])) -# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 11)])) -# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 11)])) -# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 11)])) -# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 20)])) -# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 20)])) -# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 20)])) -# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 20)])) -# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 20)])) -# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 20)])) -# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 20)])) -# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 20)])) -# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 29)])) -# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 29)])) -# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 29)])) -# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 29)])) -# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 29)])) -# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 29)])) -# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 29)])) -# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 29)])) -# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 3)])) -# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 3)])) -# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 3)])) -# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 3)])) -# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 3)])) -# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 3)])) -# compute[6] = (compute[6] + 
(pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 3)])) -# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 3)])) -# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 12)])) -# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 12)])) -# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 12)])) -# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 12)])) -# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 12)])) -# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 12)])) -# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 12)])) -# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 12)])) -# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 21)])) -# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 21)])) -# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 21)])) -# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 21)])) -# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 21)])) -# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 21)])) -# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 21)])) -# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 21)])) -# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 30)])) -# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 30)])) -# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 30)])) -# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 30)])) -# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 30)])) -# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 30)])) -# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 30)])) -# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 30)])) -# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 4)])) -# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 4)])) -# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 4)])) -# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 4)])) -# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 4)])) -# compute[5] = (compute[5] + 
(pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 4)])) -# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 4)])) -# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 4)])) -# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 13)])) -# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 13)])) -# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 13)])) -# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 13)])) -# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 13)])) -# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 13)])) -# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 13)])) -# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 13)])) -# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 22)])) -# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 22)])) -# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 22)])) -# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 22)])) -# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 22)])) -# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 22)])) -# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 22)])) -# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 22)])) -# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 31)])) -# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 31)])) -# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 31)])) -# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 31)])) -# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 31)])) -# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 31)])) -# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 31)])) -# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 31)])) -# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 5)])) -# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 5)])) -# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 5)])) -# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 5)])) -# compute[4] = (compute[4] + 
(pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 5)])) -# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 5)])) -# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 5)])) -# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 5)])) -# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 14)])) -# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 14)])) -# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 14)])) -# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 14)])) -# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 14)])) -# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 14)])) -# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 14)])) -# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 14)])) -# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 23)])) -# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 23)])) -# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 23)])) -# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 23)])) -# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 23)])) -# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 23)])) -# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 23)])) -# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 23)])) -# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 32)])) -# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 32)])) -# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 32)])) -# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 32)])) -# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 32)])) -# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 32)])) -# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 32)])) -# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 32)])) -# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 6)])) -# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 6)])) -# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 6)])) -# compute[3] = (compute[3] + 
(pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 6)])) -# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 6)])) -# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 6)])) -# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 6)])) -# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 6)])) -# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 15)])) -# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 15)])) -# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 15)])) -# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 15)])) -# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 15)])) -# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 15)])) -# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 15)])) -# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 15)])) -# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 24)])) -# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 24)])) -# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 24)])) -# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 24)])) -# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 24)])) -# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 24)])) -# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 24)])) -# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 24)])) -# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 33)])) -# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 33)])) -# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 33)])) -# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 33)])) -# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 33)])) -# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 33)])) -# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 33)])) -# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 33)])) -# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 7)])) -# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 7)])) -# compute[2] = (compute[2] 
+ (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 7)])) -# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 7)])) -# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 7)])) -# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 7)])) -# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 7)])) -# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 7)])) -# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 16)])) -# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 16)])) -# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 16)])) -# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 16)])) -# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 16)])) -# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 16)])) -# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 16)])) -# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 16)])) -# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 25)])) -# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 25)])) -# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 25)])) -# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 25)])) -# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 25)])) -# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 25)])) -# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 25)])) -# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 25)])) -# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 34)])) -# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 34)])) -# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 34)])) -# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 34)])) -# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 34)])) -# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 34)])) -# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 34)])) -# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 34)])) -# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 8)])) -# compute[1] = 
(compute[1] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 8)])) -# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 8)])) -# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 8)])) -# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 8)])) -# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 8)])) -# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 8)])) -# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 8)])) -# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 17)])) -# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 17)])) -# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 17)])) -# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 17)])) -# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 17)])) -# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 17)])) -# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 17)])) -# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 17)])) -# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 26)])) -# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 26)])) -# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 26)])) -# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 26)])) -# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 26)])) -# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 26)])) -# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 26)])) -# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 26)])) -# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 35)])) -# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 35)])) -# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 35)])) -# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 35)])) -# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 35)])) -# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 35)])) -# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 35)])) -# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 35)])) -# } -# } -# 
tensor[(((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x)] = max(((compute[0]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 224)] = max(((compute[1]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 448)] = max(((compute[2]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 672)] = max(((compute[3]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 896)] = max(((compute[4]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1120)] = max(((compute[5]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1344)] = max(((compute[6]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1568)] = max(((compute[7]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50176)] = max(((compute[8]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50400)] = max(((compute[9]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50624)] = max(((compute[10]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50848)] = max(((compute[11]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51072)] = max(((compute[12]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51296)] = max(((compute[13]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51520)] = max(((compute[14]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51744)] = max(((compute[15]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100352)] = max(((compute[16]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100576)] = max(((compute[17]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) -# 
tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100800)] = max(((compute[18]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101024)] = max(((compute[19]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101248)] = max(((compute[20]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101472)] = max(((compute[21]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101696)] = max(((compute[22]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101920)] = max(((compute[23]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150528)] = max(((compute[24]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150752)] = max(((compute[25]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150976)] = max(((compute[26]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151200)] = max(((compute[27]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151424)] = max(((compute[28]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151648)] = max(((compute[29]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151872)] = max(((compute[30]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) -# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 152096)] = max(((compute[31]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) -# } - -###################################################################### -# Use cuDNN for a convolutional layer -# ----------------------------------- -# We can use cuDNN to replace convolution kernels with cuDNN ones. -# To do that, all we need to do is to append the option " -libs=cudnn" to the target string. 
-net, params = testing.create_workload(simple_net) -target = "cuda -libs=cudnn" # use cudnn for convolution -lib = relay.build_module.build(net, target, params=params) - -dev = tvm.device(target, 0) -data = np.random.uniform(-1, 1, size=data_shape).astype("float32") -module = runtime.GraphModule(lib["default"](dev)) -module.set_input("data", data) -module.run() -out_shape = (batch_size, out_channels, 224, 224) -out = module.get_output(0, tvm.nd.empty(out_shape)) -out_cudnn = out.numpy() - -###################################################################### -# Note that if you use cuDNN, Relay cannot fuse convolution with layers following it. -# This is because layer fusion happens at the level of TVM internal representation(IR). -# Relay treats external libraries as black box, so there is no way to fuse them with TVM IR. -# -# The pseudo code below shows that cuDNN convolution + bias add + batch norm + ReLU turned into two stages of computation, one for cuDNN call and the other for the rest of operations. -# -# .. code-block:: text -# -# // attr [y] storage_scope = "global" -# allocate y[float32 * 802816] -# produce y { -# // attr [0] extern_scope = 0 -# tvm_call_packed("tvm.contrib.cudnn.conv2d.forward", 1, 0, 1, 1, 1, 1, 1, 1, 1, tvm_stack_make_array(placeholder, tvm_stack_make_shape(1, 3, 224, 224), 0, 4, 0.000000f, 0), tvm_stack_make_array(placeholder, tvm_stack_make_shape(16, 3, 3, 3), 0, 4, 0.000000f, 0), tvm_stack_make_array(y, tvm_stack_make_shape(1, 16, 224, 224), 0, 4, 0.000000f, 0)) -# } -# produce tensor { -# // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 256 -# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 512 -# for (ax0.ax1.fused.ax2.fused.ax3.fused.outer, 0, 7) { -# if (likely(((blockIdx.x*512) < ((802816 - (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072)) - threadIdx.x)))) { -# tensor[(((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/802816)*802816) + (((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/224) % 224)*224) + ((((blockIdx.x*64) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*32)) % 224))) + ((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)*50176))] = max(((y[(((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/802816)*802816) + (((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/224) % 224)*224) + ((((blockIdx.x*64) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*32)) % 224))) + ((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)*50176))]*placeholder[(((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)]) + placeholder[(((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)]), 0.000000f) -# } -# } -# } - - -###################################################################### -# Verify the result -# ----------------- -# We can check that the results of two runs match. - -tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5) - -##################################################################### -# Conclusion -# ---------- -# This tutorial covered the usage of cuDNN with Relay. -# We also have support for cuBLAS. If cuBLAS is enabled, it will be used inside a fully connected layer (relay.dense). -# To use cuBLAS, set a target string as "cuda -libs=cublas". 
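#
# As a minimal illustrative sketch (added here for clarity, not part of the original
# tutorial; it assumes a Relay module ``net`` whose main function contains a
# ``relay.nn.dense`` op and a matching ``params`` dict, as in the workload built above),
# using cuBLAS only requires changing the target string:
#
# .. code-block:: python
#
#    target = "cuda -libs=cublas"  # offload relay.nn.dense to cuBLAS
#    lib = relay.build_module.build(net, target, params=params)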
-# You can use both cuDNN and cuBLAS with "cuda -libs=cudnn,cublas".
-#
-# For the ROCm backend, we have support for MIOpen and rocBLAS.
-# They can be enabled with target "rocm -libs=miopen,rocblas".
-#
-# Being able to use external libraries is great, but we need to keep a few caveats in mind.
-#
-# First, the use of external libraries may restrict your usage of TVM and Relay.
-# For example, MIOpen only supports the NCHW layout and the fp32 data type at the moment, so you cannot use other layouts or data types in TVM.
-#
-# Second, and more importantly, external libraries restrict the possibility of operator fusion during graph compilation, as shown above.
-# TVM and Relay aim to achieve the best performance on a variety of hardware, with joint operator-level and graph-level optimization.
-# To achieve this goal, we should continue developing better optimizations for TVM and Relay, while using external libraries as a convenient way to fall back to existing implementations when necessary.
diff --git a/gallery/how_to/work_with_relay/using_pipeline_executor.py b/gallery/how_to/work_with_relay/using_pipeline_executor.py
deleted file mode 100644
index 8bb53cc743c3..000000000000
--- a/gallery/how_to/work_with_relay/using_pipeline_executor.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Using Pipeline Executor in Relay
-=================================
-**Author**: `Hua Jiang `_
-
-This is a short tutorial on how to use the "Pipeline Executor" with Relay.
-"""
-import tvm
-from tvm import te
-import numpy as np
-from tvm.contrib import graph_executor as runtime
-from tvm.relay.op.contrib.cutlass import partition_for_cutlass
-from tvm import relay
-from tvm.relay import testing
-import tvm.testing
-from tvm.contrib.cutlass import finalize_modules
-
-img_size = 8
-######################################################################
-# Create a simple network; this network can be a pre-trained model too.
-# ---------------------------------------------------------------------
-# Let's create a very simple network for demonstration.
-# It consists of convolution, batch normalization, dense, and ReLU activation.
-def get_network(): - out_channels = 16 - batch_size = 1 - data = relay.var("data", relay.TensorType((batch_size, 3, img_size, img_size), "float16")) - dense_weight = relay.var( - "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), "float16") - ) - weight = relay.var("weight") - bn_gamma = relay.var("bn_gamma") - bn_beta = relay.var("bn_beta") - bn_mmean = relay.var("bn_mean") - bn_mvar = relay.var("bn_var") - simple_net = relay.nn.conv2d( - data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, padding=(1, 1) - ) - simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0] - simple_net = relay.nn.relu(simple_net) - simple_net = relay.nn.batch_flatten(simple_net) - simple_net = relay.nn.dense(simple_net, dense_weight) - simple_net = relay.Function(relay.analysis.free_vars(simple_net), simple_net) - data_shape = (batch_size, 3, img_size, img_size) - net, params = testing.create_workload(simple_net) - return net, params, data_shape - - -net, params, data_shape = get_network() -########################################### -# Splitting the network into two subgraphs. -# ----------------------------------------- -# This function called 'graph_split' from a unit test is just an example. User can create a customized logic -# to split the graph. -import inspect -import os - -tutorial_dir = os.path.dirname(inspect.getfile(lambda: None)) -os.sys.path.append(os.path.join(tutorial_dir, "../../../tests/python/relay")) -from test_pipeline_executor import graph_split - -########################################### -# Splitting the network into two subgraphs. -split_config = [{"op_name": "nn.relu", "op_index": 0}] -subgraphs = graph_split(net["main"], split_config, params) -########################################################### -# The generated subgraphs should look something like below. - -""" -#subgraphs[0]) - - def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) { - %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* ty=Tensor[(1, 16, img_size, img_size), float16] */; - %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] */, meta[relay.Constant][2] /* ty=Tensor[(16), float16]*/, meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] /* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1,16, img_size, img_size), float16], Tensor[(16), float16], Tensor[(16), float16]) */; - %2 = %1.0; - nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */ - } - -#subgraphs[1] - - def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 8), float16] */) { - %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */; - nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, units=None) /* ty=Tensor[(1, 1), float16] */ - } - -""" - - -######################################### -# Build the subgraph with cutlass target. 
-# --------------------------------------- - -cutlass = tvm.target.Target( - { - "kind": "cutlass", - "sm": int(tvm.target.Target("cuda").arch.split("_")[1]), - "use_3xtf32": True, - "split_k_slices": [1], - "profile_all_alignments": False, - "find_first_valid": True, - "use_multiprocessing": True, - "use_fast_math": False, - "tmp_dir": "./tmp", - }, - host=tvm.target.Target("llvm"), -) - - -def cutlass_build(mod, target, params=None, target_host=None, mod_name="default"): - target = [target, cutlass] - lib = relay.build_module.build( - mod, target=target, params=params, target_host=target_host, mod_name=mod_name - ) - return lib - - -########################################################### -# Run the two subgraphs in pipeline with pipeline executor. -# --------------------------------------------------------- -# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON in cmake. -from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build - -######################################### -# Create subgraph pipeline configuration. -# Associate a subgraph module with a target. -# Use CUTLASS BYOC to build the second subgraph module. -mod0, mod1 = subgraphs[0], subgraphs[1] -# Use cutlass as the codegen. -mod1 = partition_for_cutlass(mod1) -################################################# -# Get the pipeline executor configuration object. -pipe_config = pipeline_executor_build.PipelineConfig() -########################################################################### -# Set the compile target of the subgraph module. -pipe_config[mod0].target = "llvm" -pipe_config[mod0].dev = tvm.cpu(0) -############################################################## -# Set the compile target of the second subgraph module as cuda. -pipe_config[mod1].target = "cuda" -pipe_config[mod1].dev = tvm.device("cuda", 0) -pipe_config[mod1].build_func = cutlass_build -pipe_config[mod1].export_cc = "nvcc" -# Create the pipeline by connecting the subgraph modules. -# The global input will be forwarded to the input interface of the first module named mod0 -pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"]) -# The first output of mod0 will be forwarded to the input interface of mod1 -pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"]) -# The first output of mod1 will be the first global output. -pipe_config[mod1]["output"][0].connect(pipe_config["output"][0]) -###################################### -# The pipeline configuration as below. -""" -print(pipe_config) - Inputs - |data: mod0:data - - output - |output(0) : mod1.output(0) - - connections - |mod0.output(0)-> mod1.data_n_0 -""" - -############################## -# Build the pipeline executor. -# ---------------------------- -with tvm.transform.PassContext(opt_level=3): - pipeline_mod_factory = pipeline_executor_build.build(pipe_config) -############################################### -# Export the parameter configuration to a file. -directory_path = tvm.contrib.utils.tempdir().temp_dir -os.makedirs(directory_path, exist_ok=True) -config_file_name = pipeline_mod_factory.export_library(directory_path) -################################################################ -# Use the load function to create and initialize PipelineModule. -# -------------------------------------------------------------- -pipeline_module = pipeline_executor.PipelineModule.load_library(config_file_name) - -############################ -# Run the pipeline executor. -# -------------------------- -# Allocate input data. 
-data = np.random.uniform(-1, 1, size=data_shape).astype("float16") -pipeline_module.set_input("data", tvm.nd.array(data)) -########################################################################## -# Run the two subgraph in the pipeline mode to get the output asynchronously -# or synchronously. In the following example, it is synchronous. -pipeline_module.run() -outputs = pipeline_module.get_output() -###################################### -# Use graph_executor for verification. -# ------------------------------------ -# Run these two subgraphs in sequence with graph_executor to get the output. -target = "llvm" -dev0 = tvm.device(target, 0) -lib0 = relay.build_module.build(mod0, target, params=params) -module0 = runtime.GraphModule(lib0["default"](dev0)) -cuda = tvm.target.Target("cuda", host=tvm.target.Target("llvm")) -lib1 = relay.build_module.build(mod1, [cuda, cutlass], params=params) -lib1 = finalize_modules(lib1, "compile.so", "./tmp") - -dev1 = tvm.device("cuda", 0) - -module1 = runtime.GraphModule(lib1["default"](dev1)) - -module0.set_input("data", data) -module0.run() -out_shape = (1, 16, img_size, img_size) -out = module0.get_output(0, tvm.nd.empty(out_shape, "float16")) -module1.set_input("data_n_0", out) -module1.run() -out_shape = (1, 1) -out = module1.get_output(0, tvm.nd.empty(out_shape, "float16")) -#################### -# Verify the result. -tvm.testing.assert_allclose(outputs[0].numpy(), out.numpy()) diff --git a/gallery/how_to/work_with_relay/using_relay_viz.py b/gallery/how_to/work_with_relay/using_relay_viz.py deleted file mode 100644 index ce874ca48508..000000000000 --- a/gallery/how_to/work_with_relay/using_relay_viz.py +++ /dev/null @@ -1,169 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=line-too-long -""" -Use Relay Visualizer to Visualize Relay -============================================================ -**Author**: `Chi-Wei Wang `_ - -Relay IR module can contain lots of operations. Although an individual -operation is usually easy to understand, putting them together can cause -a complicated, hard-to-read graph. Things can get even worse with optimization-passes -coming into play. - -This utility visualizes an IR module as nodes and edges. It defines a set of interfaces including -parser, plotter(renderer), graph, node, and edges. -A default parser is provided. Users can implement their own renderers to render the graph. - -Here we use a renderer rendering graph in the text-form. -It is a lightweight, AST-like visualizer, inspired by `clang ast-dump `_. -We will introduce how to implement customized parsers and renderers through interface classes. -To install dependencies, run: - -.. 
code-block:: bash - - %%shell - pip install graphviz - - -For more details, please refer to :py:mod:`tvm.contrib.relay_viz`. -""" - -from typing import ( - Dict, - Union, - Tuple, - List, -) -import tvm -from tvm import relay -from tvm.contrib import relay_viz -from tvm.contrib.relay_viz.interface import ( - VizEdge, - VizNode, - VizParser, -) -from tvm.contrib.relay_viz.terminal import ( - TermGraph, - TermPlotter, - TermVizParser, -) - -###################################################################### -# Define a Relay IR Module with multiple GlobalVar -# ------------------------------------------------ -# Let's build an example Relay IR Module containing multiple ``GlobalVar``. -# We define an ``add`` function and call it in the main function. -data = relay.var("data") -bias = relay.var("bias") -add_op = relay.add(data, bias) -add_func = relay.Function([data, bias], add_op) -add_gvar = relay.GlobalVar("AddFunc") - -input0 = relay.var("input0") -input1 = relay.var("input1") -input2 = relay.var("input2") -add_01 = relay.Call(add_gvar, [input0, input1]) -add_012 = relay.Call(add_gvar, [input2, add_01]) -main_func = relay.Function([input0, input1, input2], add_012) -main_gvar = relay.GlobalVar("main") - -mod = tvm.IRModule({main_gvar: main_func, add_gvar: add_func}) - -###################################################################### -# Render the graph with Relay Visualizer on the terminal -# ------------------------------------------------------ -# The terminal can show a Relay IR module in text similar to clang AST-dump. -# We should see ``main`` and ``AddFunc`` function. ``AddFunc`` is called twice in the ``main`` function. -viz = relay_viz.RelayVisualizer(mod) -viz.render() - -###################################################################### -# Customize Parser for Interested Relay Types -# ------------------------------------------- -# Sometimes we want to emphasize interested information, or parse things differently for a specific usage. -# It is possible to provide customized parsers as long as it obeys the interface. -# Here demonstrate how to customize parsers for ``relay.var``. -# We need to implement abstract interface :py:class:`tvm.contrib.relay_viz.interface.VizParser`. -class YourAwesomeParser(VizParser): - def __init__(self): - self._delegate = TermVizParser() - - def get_node_edges( - self, - node: relay.Expr, - relay_param: Dict[str, tvm.runtime.NDArray], - node_to_id: Dict[relay.Expr, str], - ) -> Tuple[Union[VizNode, None], List[VizEdge]]: - - if isinstance(node, relay.Var): - node = VizNode(node_to_id[node], "AwesomeVar", f"name_hint {node.name_hint}") - # no edge is introduced. So return an empty list. - return node, [] - - # delegate other types to the other parser. - return self._delegate.get_node_edges(node, relay_param, node_to_id) - - -###################################################################### -# Pass the parser and an interested renderer to visualizer. -# Here we just the terminal renderer. -viz = relay_viz.RelayVisualizer(mod, {}, TermPlotter(), YourAwesomeParser()) -viz.render() - -###################################################################### -# Customization around Graph and Plotter -# ------------------------------------------- -# Besides parsers, we can also customize graph and renderers by implementing -# abstract class :py:class:`tvm.contrib.relay_viz.interface.VizGraph` and -# :py:class:`tvm.contrib.relay_viz.interface.Plotter`. -# Here we override the ``TermGraph`` defined in ``terminal.py`` for easier demo. 
-# We add a hook duplicating above ``AwesomeVar``, and make ``TermPlotter`` use the new class. -class AwesomeGraph(TermGraph): - def node(self, viz_node): - # add the node first - super().node(viz_node) - # if it's AwesomeVar, duplicate it. - if viz_node.type_name == "AwesomeVar": - duplicated_id = f"duplicated_{viz_node.identity}" - duplicated_type = "double AwesomeVar" - super().node(VizNode(duplicated_id, duplicated_type, "")) - # connect the duplicated var to the original one - super().edge(VizEdge(duplicated_id, viz_node.identity)) - - -# override TermPlotter to use `AwesomeGraph` instead -class AwesomePlotter(TermPlotter): - def create_graph(self, name): - self._name_to_graph[name] = AwesomeGraph(name) - return self._name_to_graph[name] - - -viz = relay_viz.RelayVisualizer(mod, {}, AwesomePlotter(), YourAwesomeParser()) -viz.render() - -###################################################################### -# Summary -# ------- -# This tutorial demonstrates the usage of Relay Visualizer and customization. -# The class :py:class:`tvm.contrib.relay_viz.RelayVisualizer` is composed of interfaces -# defined in ``interface.py``. -# -# It is aimed for quick look-then-fix iterations. -# The constructor arguments are intended to be simple, while the customization is still -# possible through a set of interface classes. -# diff --git a/gallery/how_to/work_with_schedules/README.txt b/gallery/how_to/work_with_schedules/README.txt deleted file mode 100644 index 6516af3b3d2f..000000000000 --- a/gallery/how_to/work_with_schedules/README.txt +++ /dev/null @@ -1,2 +0,0 @@ -Work With Tensor Expression and Schedules ------------------------------------------ diff --git a/gallery/how_to/work_with_schedules/extern_op.py b/gallery/how_to/work_with_schedules/extern_op.py deleted file mode 100644 index 9026eb016c56..000000000000 --- a/gallery/how_to/work_with_schedules/extern_op.py +++ /dev/null @@ -1,140 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -External Tensor Functions -========================= -**Author**: `Tianqi Chen `_ - -While TVM supports transparent code generation, sometimes -it is also helpful to incorporate manual written code into -the pipeline. For example, we might want to use cuDNN for -some of the convolution kernels and define the rest of the stages. - -TVM supports these black box function calls natively. -Specifically, TVM support all the tensor functions that are DLPack compatible. -Which means we can call any function with POD types(pointer, int, float) -or pointer to DLTensor as argument. 
-""" -from __future__ import absolute_import, print_function - - -import tvm -from tvm import te -import numpy as np -from tvm.contrib import cblas -import tvm.testing - -if not tvm.get_global_func("tvm.contrib.cblas.matmul", allow_missing=True): - raise Exception("Not compiled with cblas support; can't build this tutorial") - -###################################################################### -# Use Extern Tensor Function -# -------------------------- -# In the example below, we use :any:`te.extern` to add an extern -# array function call. In the extern call, we declare the shape -# of output tensors. In the second argument we provide the list of inputs. -# -# User will need to provide a function describing how to compute the result. -# The compute function takes list of symbolic placeholder for the inputs, -# list of symbolic placeholder for the outputs and returns the executing statement. -# -# In this case we simply call a registered TVM function, which invokes a CBLAS call. -# TVM does not control internal of the extern array function and treats it as black-box. -# We can further mix schedulable TVM calls that add a bias term to the result. -# -n = 1024 -l = 128 -m = 235 -bias = te.var("bias", dtype="float32") -A = te.placeholder((n, l), name="A") -B = te.placeholder((l, m), name="B") -C = te.extern( - (n, m), - [A, B], - lambda ins, outs: tvm.tir.call_packed( - "tvm.contrib.cblas.matmul", ins[0], ins[1], outs[0], False, False - ), - name="C", -) -D = te.compute(C.shape, lambda i, j: C[i, j] + bias, name="D") -s = te.create_schedule(D.op) - -###################################################################### -# Verify the Result -# ----------------- -# We can verify that the result matches what we expected. -# -dev = tvm.cpu(0) -f = tvm.build(s, [A, B, D, bias], "llvm") -a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), dev) -b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), dev) -d = tvm.nd.array(np.zeros((n, m), dtype=D.dtype), dev) -bb = 10.0 -f(a, b, d, bb) -tvm.testing.assert_allclose(d.numpy(), np.dot(a.numpy(), b.numpy()) + 10, rtol=1e-5) - -###################################################################### -# Extern Contrib Wrappers -# ----------------------- -# TVM also provide extern contrib wrappers to useful extern calls, -# the following line is equivalent to the previous example. -# -from tvm.contrib import cblas - -C = cblas.matmul(A, B) -D = te.compute(C.shape, lambda i, j: C[i, j] + bias, name="D") -s = te.create_schedule(D.op) - -###################################################################### -# Hook Python Function as Extern -# ------------------------------ -# Since we can call into any PackedFunc in TVM. We can use the extern -# function to callback into python. -# -# The following example registers a python function into TVM runtime system -# and use it to complete one stage of the computation. -# This makes TVM much more flexible. For example, we can insert front-end -# callbacks to inspect the intermediate results or mix customized code -# with TVM. 
-# -@tvm.register_func("tvm.contrib.my_tvm_addone") -def my_tvm_addone(x, y): - print("my_tvm_addone signatures: %s, %s" % (type(x), type(y))) - tvm.nd.array(x.numpy() + 1).copyto(y) - - -A = te.placeholder((n,), name="A") -B = te.extern( - A.shape, - [A], - lambda ins, outs: tvm.tir.call_packed("tvm.contrib.my_tvm_addone", ins[0], outs[0]), - name="C", -) -s = te.create_schedule(B.op) -f = tvm.build(s, [A, B], "llvm") -a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), dev) -b = tvm.nd.array(np.random.uniform(size=(n,)).astype(B.dtype), dev) -f(a, b) -tvm.testing.assert_allclose(b.numpy(), a.numpy() + 1, rtol=1e-5) - -###################################################################### -# Summary -# ------- -# - TVM calls extern tensor function via :any:`te.extern` -# - Use contrib wrappers for short sugars of extern tensor calls. -# - We can hook front-end function as extern tensor callbacks. -# diff --git a/gallery/how_to/work_with_schedules/reduction.py b/gallery/how_to/work_with_schedules/reduction.py deleted file mode 100644 index 72c8d691a9e0..000000000000 --- a/gallery/how_to/work_with_schedules/reduction.py +++ /dev/null @@ -1,200 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Reduction -========= -**Author**: `Tianqi Chen `_ - -This is an introduction material on how to do reduction in TVM. -Associative reduction operators like sum/max/min are typical -construction blocks of linear algebra operations. - -In this tutorial, we will demonstrate how to do reduction in TVM. -""" -from __future__ import absolute_import, print_function - - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import tvm -import tvm.testing -from tvm import te -import numpy as np - -###################################################################### -# Describe Sum of Rows -# -------------------- -# Assume we want to compute sum of rows as our example. -# In numpy semantics this can be written as :code:`B = numpy.sum(A, axis=1)` -# -# The following lines describe the row sum operation. -# To create a reduction formula, we declare a reduction axis using -# :any:`te.reduce_axis`. :any:`te.reduce_axis` takes in the range of reductions. -# :any:`te.sum` takes in the expression to be reduced as well as the reduction -# axis and compute the sum of value over all k in the declared range. -# -# The equivalent C code is as follows: -# -# .. 
code-block:: c -# -# for (int i = 0; i < n; ++i) { -# B[i] = 0; -# for (int k = 0; k < m; ++k) { -# B[i] = B[i] + A[i][k]; -# } -# } -# -n = te.var("n") -m = te.var("m") -A = te.placeholder((n, m), name="A") -k = te.reduce_axis((0, m), "k") -B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B") - -###################################################################### -# Schedule the Reduction -# ---------------------- -# There are several ways to schedule a reduction. -# Before doing anything, let us print out the IR code of default schedule. -# -s = te.create_schedule(B.op) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# You can find that the IR code is quite like the C code. -# The reduction axis is similar to a normal axis, it can be splitted. -# -# In the following code we split both the row axis of B as well -# axis by different factors. The result is a nested reduction. -# -ko, ki = s[B].split(B.op.reduce_axis[0], factor=16) -xo, xi = s[B].split(B.op.axis[0], factor=32) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# If we are building a GPU kernel, we can bind the rows of B to GPU threads. -s[B].bind(xo, te.thread_axis("blockIdx.x")) -s[B].bind(xi, te.thread_axis("threadIdx.x")) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# Reduction Factoring and Parallelization -# --------------------------------------- -# One problem of building a reduction is that we cannot simply -# parallelize over the reduction axis. We need to divide the computation -# of the reduction, store the local reduction result in a temporal array -# before doing a reduction over the temp array. -# -# The rfactor primitive does such rewrite of the computation. -# In the following schedule, the result of B is written to a temporary -# result B.rf. The factored dimension becomes the first dimension of B.rf. -# -s = te.create_schedule(B.op) -ko, ki = s[B].split(B.op.reduce_axis[0], factor=16) -BF = s.rfactor(B, ki) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# The scheduled operator of B also get rewritten to be sum over -# the first axis of reduced result of B.f -# -print(s[B].op.body) - -###################################################################### -# Cross Thread Reduction -# ---------------------- -# We can now parallelize over the factored axis. -# Here the reduction axis of B is marked to be a thread. -# TVM allows reduction axis to be marked as thread if it is the only -# axis in reduction and cross thread reduction is possible in the device. -# -# This is indeed the case after the factoring. -# We can directly compute BF at the reduction axis as well. 
-# The final generated kernel will divide the rows by blockIdx.x and threadIdx.y -# columns by threadIdx.x and finally do a cross thread reduction over threadIdx.x -# -xo, xi = s[B].split(s[B].op.axis[0], factor=32) -s[B].bind(xo, te.thread_axis("blockIdx.x")) -s[B].bind(xi, te.thread_axis("threadIdx.y")) -tx = te.thread_axis("threadIdx.x") -s[B].bind(s[B].op.reduce_axis[0], tx) -s[BF].compute_at(s[B], s[B].op.reduce_axis[0]) -s[B].set_store_predicate(tx.var.equal(0)) -fcuda = tvm.build(s, [A, B], "cuda") -print(fcuda.imported_modules[0].get_source()) - -###################################################################### -# Verify the correctness of result kernel by comparing it to numpy. -# -nn = 128 -dev = tvm.cuda(0) -a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), dev) -b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), dev) -fcuda(a, b) -tvm.testing.assert_allclose(b.numpy(), np.sum(a.numpy(), axis=1), rtol=1e-4) - -###################################################################### -# Describe Convolution via 2D Reduction -# ------------------------------------- -# In TVM, we can describe convolution via 2D reduction in a simple way. -# Here is an example for 2D convolution with filter size = [3, 3] and strides = [1, 1]. -# -n = te.var("n") -Input = te.placeholder((n, n), name="Input") -Filter = te.placeholder((3, 3), name="Filter") -di = te.reduce_axis((0, 3), name="di") -dj = te.reduce_axis((0, 3), name="dj") -Output = te.compute( - (n - 2, n - 2), - lambda i, j: te.sum(Input[i + di, j + dj] * Filter[di, dj], axis=[di, dj]), - name="Output", -) -s = te.create_schedule(Output.op) -print(tvm.lower(s, [Input, Filter, Output], simple_mode=True)) - -###################################################################### -# .. _general-reduction: -# -# Define General Commutative Reduction Operation -# ---------------------------------------------- -# Besides the built-in reduction operations like :any:`te.sum`, -# :any:`tvm.te.min` and :any:`tvm.te.max`, you can also define your -# commutative reduction operation by :any:`te.comm_reducer`. -# - -n = te.var("n") -m = te.var("m") -product = te.comm_reducer(lambda x, y: x * y, lambda t: tvm.tir.const(1, dtype=t), name="product") -A = te.placeholder((n, m), name="A") -k = te.reduce_axis((0, m), name="k") -B = te.compute((n,), lambda i: product(A[i, k], axis=k), name="B") - -###################################################################### -# .. note:: -# -# Sometimes we would like to perform reduction that involves multiple -# values like :code:`argmax`, which can be done by tuple inputs. -# See :ref:`reduction-with-tuple-inputs` for more detail. - -###################################################################### -# Summary -# ------- -# This tutorial provides a walk through of reduction schedule. -# -# - Describe reduction with reduce_axis. -# - Use rfactor to factor out axis if we need parallelism. -# - Define new reduction operation by :any:`te.comm_reducer` diff --git a/gallery/how_to/work_with_schedules/scan.py b/gallery/how_to/work_with_schedules/scan.py deleted file mode 100644 index 4c5ce94e0121..000000000000 --- a/gallery/how_to/work_with_schedules/scan.py +++ /dev/null @@ -1,158 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Scan and Recurrent Kernel -========================= -**Author**: `Tianqi Chen `_ - -This is an introduction material on how to do recurrent computing in TVM. -Recurrent computing is a typical pattern in neural networks. -""" -from __future__ import absolute_import, print_function - - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import tvm -import tvm.testing -from tvm import te -import numpy as np - -###################################################################### -# TVM supports a scan operator to describe symbolic loop. -# The following scan op computes cumsum over columns of X. -# -# The scan is carried over the highest dimension of the tensor. -# :code:`s_state` is a placeholder that describes the transition state of the scan. -# :code:`s_init` describes how we can initialize the first k timesteps. -# Here since s_init's first dimension is 1, it describes how we initialize -# The state at first timestep. -# -# :code:`s_update` describes how to update the value at timestep t. The update -# value can refer back to the values of previous timestep via state placeholder. -# Note that while it is invalid to refer to :code:`s_state` at current or later timestep. -# -# The scan takes in state placeholder, initial value and update description. -# It is also recommended(although not necessary) to list the inputs to the scan cell. -# The result of the scan is a tensor, giving the result of :code:`s_state` after the -# update over the time domain. -# -m = te.var("m") -n = te.var("n") -X = te.placeholder((m, n), name="X") -s_state = te.placeholder((m, n)) -s_init = te.compute((1, n), lambda _, i: X[0, i]) -s_update = te.compute((m, n), lambda t, i: s_state[t - 1, i] + X[t, i]) -s_scan = tvm.te.scan(s_init, s_update, s_state, inputs=[X]) - -###################################################################### -# Schedule the Scan Cell -# ---------------------- -# We can schedule the body of the scan by scheduling the update and -# init part separately. Note that it is invalid to schedule the -# first iteration dimension of the update part. -# To split on the time iteration, user can schedule on scan_op.scan_axis instead. -# -s = te.create_schedule(s_scan.op) -num_thread = 256 -block_x = te.thread_axis("blockIdx.x") -thread_x = te.thread_axis("threadIdx.x") -xo, xi = s[s_init].split(s_init.op.axis[1], factor=num_thread) -s[s_init].bind(xo, block_x) -s[s_init].bind(xi, thread_x) -xo, xi = s[s_update].split(s_update.op.axis[1], factor=num_thread) -s[s_update].bind(xo, block_x) -s[s_update].bind(xi, thread_x) -print(tvm.lower(s, [X, s_scan], simple_mode=True)) - -###################################################################### -# Build and Verify -# ---------------- -# We can build the scan kernel like other TVM kernels, here we use -# numpy to verify the correctness of the result. 
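######################################################################
# For clarity, the numpy reference used in the check below is a cumulative
# sum along the scan (time) axis: row ``t`` of the result equals the sum of
# input rows ``0..t``. A tiny self-contained illustration:
import numpy as np

ref_x = np.arange(12, dtype="float32").reshape(4, 3)
ref_cumsum = np.cumsum(ref_x, axis=0)
np.testing.assert_allclose(ref_cumsum[2], ref_x[0] + ref_x[1] + ref_x[2])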
-# -fscan = tvm.build(s, [X, s_scan], "cuda", name="myscan") -dev = tvm.cuda(0) -n = 1024 -m = 10 -a_np = np.random.uniform(size=(m, n)).astype(s_scan.dtype) -a = tvm.nd.array(a_np, dev) -b = tvm.nd.array(np.zeros((m, n), dtype=s_scan.dtype), dev) -fscan(a, b) -tvm.testing.assert_allclose(b.numpy(), np.cumsum(a_np, axis=0)) - -###################################################################### -# Multi-Stage Scan Cell -# --------------------- -# In the above example we described the scan cell using one Tensor -# computation stage in s_update. It is possible to use multiple -# Tensor stages in the scan cell. -# -# The following lines demonstrate a scan with two stage operations -# in the scan cell. -# -m = te.var("m") -n = te.var("n") -X = te.placeholder((m, n), name="X") -s_state = te.placeholder((m, n)) -s_init = te.compute((1, n), lambda _, i: X[0, i]) -s_update_s1 = te.compute((m, n), lambda t, i: s_state[t - 1, i] * 2, name="s1") -s_update_s2 = te.compute((m, n), lambda t, i: s_update_s1[t, i] + X[t, i], name="s2") -s_scan = tvm.te.scan(s_init, s_update_s2, s_state, inputs=[X]) - -###################################################################### -# These intermediate tensors can also be scheduled normally. -# To ensure correctness, TVM creates a group constraint to forbid -# the body of scan to be compute_at locations outside the scan loop. -# -s = te.create_schedule(s_scan.op) -xo, xi = s[s_update_s2].split(s_update_s2.op.axis[1], factor=32) -s[s_update_s1].compute_at(s[s_update_s2], xo) -print(tvm.lower(s, [X, s_scan], simple_mode=True)) - -###################################################################### -# Multiple States -# --------------- -# For complicated applications like RNN, we might need more than one -# recurrent state. Scan support multiple recurrent states. -# The following example demonstrates how we can build recurrence with two states. -# -m = te.var("m") -n = te.var("n") -l = te.var("l") -X = te.placeholder((m, n), name="X") -s_state1 = te.placeholder((m, n)) -s_state2 = te.placeholder((m, l)) -s_init1 = te.compute((1, n), lambda _, i: X[0, i]) -s_init2 = te.compute((1, l), lambda _, i: 0.0) -s_update1 = te.compute((m, n), lambda t, i: s_state1[t - 1, i] + X[t, i]) -s_update2 = te.compute((m, l), lambda t, i: s_state2[t - 1, i] + s_state1[t - 1, 0]) -s_scan1, s_scan2 = tvm.te.scan( - [s_init1, s_init2], [s_update1, s_update2], [s_state1, s_state2], inputs=[X] -) -s = te.create_schedule(s_scan1.op) -print(tvm.lower(s, [X, s_scan1, s_scan2], simple_mode=True)) - -###################################################################### -# Summary -# ------- -# This tutorial provides a walk through of scan primitive. -# -# - Describe scan with init and update. -# - Schedule the scan cells as normal schedule. -# - For complicated workload, use multiple states and steps in scan cell. diff --git a/gallery/how_to/work_with_schedules/schedule_primitives.py b/gallery/how_to/work_with_schedules/schedule_primitives.py deleted file mode 100644 index a5c542df548b..000000000000 --- a/gallery/how_to/work_with_schedules/schedule_primitives.py +++ /dev/null @@ -1,211 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _schedule_primitives: - -Schedule Primitives in TVM -========================== -**Author**: `Ziheng Jiang `_ - -TVM is a domain specific language for efficient kernel construction. - -In this tutorial, we will show you how to schedule the computation by -various primitives provided by TVM. -""" -from __future__ import absolute_import, print_function - - -import tvm -from tvm import te -import numpy as np - -###################################################################### -# -# There often exist several methods to compute the same result, -# however, different methods will result in different locality and -# performance. So TVM asks user to provide how to execute the -# computation called **Schedule**. -# -# A **Schedule** is a set of transformation of computation that -# transforms the loop of computations in the program. -# - -# declare some variables for use later -n = te.var("n") -m = te.var("m") - -###################################################################### -# A schedule can be created from a list of ops, by default the -# schedule computes tensor in a serial manner in a row-major order. - -# declare a matrix element-wise multiply -A = te.placeholder((m, n), name="A") -B = te.placeholder((m, n), name="B") -C = te.compute((m, n), lambda i, j: A[i, j] * B[i, j], name="C") - -s = te.create_schedule([C.op]) -# lower will transform the computation from definition to the real -# callable function. With argument `simple_mode=True`, it will -# return you a readable C like statement, we use it here to print the -# schedule result. -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# One schedule is composed by multiple stages, and one -# **Stage** represents schedule for one operation. We provide various -# methods to schedule every stage. - -###################################################################### -# split -# ----- -# :code:`split` can split a specified axis into two axes by -# :code:`factor`. -A = te.placeholder((m,), name="A") -B = te.compute((m,), lambda i: A[i] * 2, name="B") - -s = te.create_schedule(B.op) -xo, xi = s[B].split(B.op.axis[0], factor=32) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# You can also split a axis by :code:`nparts`, which splits the axis -# contrary with :code:`factor`. -A = te.placeholder((m,), name="A") -B = te.compute((m,), lambda i: A[i], name="B") - -s = te.create_schedule(B.op) -bx, tx = s[B].split(B.op.axis[0], nparts=32) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# tile -# ---- -# :code:`tile` help you execute the computation tile by tile over two -# axes. 
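######################################################################
# A minimal sketch for comparison (reusing the ``m``/``n`` variables and
# imports declared above): ``tile`` is essentially shorthand for two
# ``split`` calls followed by a ``reorder``, so the sequence below should
# produce the same loop nest as the ``tile`` example that follows.
A_eq = te.placeholder((m, n), name="A")
B_eq = te.compute((m, n), lambda i, j: A_eq[i, j], name="B")

s_eq = te.create_schedule(B_eq.op)
xo_eq, xi_eq = s_eq[B_eq].split(B_eq.op.axis[0], factor=10)
yo_eq, yi_eq = s_eq[B_eq].split(B_eq.op.axis[1], factor=5)
s_eq[B_eq].reorder(xo_eq, yo_eq, xi_eq, yi_eq)
print(tvm.lower(s_eq, [A_eq, B_eq], simple_mode=True))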
-A = te.placeholder((m, n), name="A") -B = te.compute((m, n), lambda i, j: A[i, j], name="B") - -s = te.create_schedule(B.op) -xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# fuse -# ---- -# :code:`fuse` can fuse two consecutive axes of one computation. -A = te.placeholder((m, n), name="A") -B = te.compute((m, n), lambda i, j: A[i, j], name="B") - -s = te.create_schedule(B.op) -# tile to four axes first: (i.outer, j.outer, i.inner, j.inner) -xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5) -# then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused) -fused = s[B].fuse(xi, yi) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# reorder -# ------- -# :code:`reorder` can reorder the axes in the specified order. -A = te.placeholder((m, n), name="A") -B = te.compute((m, n), lambda i, j: A[i, j], name="B") - -s = te.create_schedule(B.op) -# tile to four axes first: (i.outer, j.outer, i.inner, j.inner) -xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5) -# then reorder the axes: (i.inner, j.outer, i.outer, j.inner) -s[B].reorder(xi, yo, xo, yi) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# bind -# ---- -# :code:`bind` can bind a specified axis with a thread axis, often used -# in gpu programming. -A = te.placeholder((n,), name="A") -B = te.compute(A.shape, lambda i: A[i] * 2, name="B") - -s = te.create_schedule(B.op) -bx, tx = s[B].split(B.op.axis[0], factor=64) -s[B].bind(bx, te.thread_axis("blockIdx.x")) -s[B].bind(tx, te.thread_axis("threadIdx.x")) -print(tvm.lower(s, [A, B], simple_mode=True)) - -###################################################################### -# compute_at -# ---------- -# For a schedule that consists of multiple operators, TVM will compute -# tensors at the root separately by default. -A = te.placeholder((m,), name="A") -B = te.compute((m,), lambda i: A[i] + 1, name="B") -C = te.compute((m,), lambda i: B[i] * 2, name="C") - -s = te.create_schedule(C.op) -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# :code:`compute_at` can move computation of `B` into the first axis -# of computation of `C`. -A = te.placeholder((m,), name="A") -B = te.compute((m,), lambda i: A[i] + 1, name="B") -C = te.compute((m,), lambda i: B[i] * 2, name="C") - -s = te.create_schedule(C.op) -s[B].compute_at(s[C], C.op.axis[0]) -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# compute_inline -# -------------- -# :code:`compute_inline` can mark one stage as inline, then the body of -# computation will be expanded and inserted at the address where the -# tensor is required. -A = te.placeholder((m,), name="A") -B = te.compute((m,), lambda i: A[i] + 1, name="B") -C = te.compute((m,), lambda i: B[i] * 2, name="C") - -s = te.create_schedule(C.op) -s[B].compute_inline() -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# compute_root -# ------------ -# :code:`compute_root` can move computation of one stage to the root. 
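# ``compute_root`` is effectively the inverse of ``compute_at``: in the
# example below, stage ``B`` is first attached into ``C``'s loop nest and
# then moved back out to the root, so the printed IR matches the default
# schedule again.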
-A = te.placeholder((m,), name="A") -B = te.compute((m,), lambda i: A[i] + 1, name="B") -C = te.compute((m,), lambda i: B[i] * 2, name="C") - -s = te.create_schedule(C.op) -s[B].compute_at(s[C], C.op.axis[0]) -s[B].compute_root() -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# Summary -# ------- -# This tutorial provides an introduction to schedule primitives in -# tvm, which permits users schedule the computation easily and -# flexibly. -# -# In order to get a good performance kernel implementation, the -# general workflow often is: -# -# - Describe your computation via series of operations. -# - Try to schedule the computation with primitives. -# - Compile and run to see the performance difference. -# - Adjust your schedule according the running result. diff --git a/gallery/how_to/work_with_schedules/tedd.py b/gallery/how_to/work_with_schedules/tedd.py deleted file mode 100644 index 7d7f8f149002..000000000000 --- a/gallery/how_to/work_with_schedules/tedd.py +++ /dev/null @@ -1,161 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Use Tensor Expression Debug Display (TEDD) for Visualization -============================================================ -**Author**: `Yongfeng Gu `_ - -This is an introduction about using TEDD to visualize tensor expressions. - -Tensor Expressions are scheduled with primitives. Although individual -primitives are usually easy to understand, they become complicated quickly -when you put them together. We have introduced an operational model of -schedule primitives in Tensor Expression. - -* the interactions between different schedule primitives, -* the impact of the schedule primitives on the final code generation. - -The operational model is based on a Dataflow Graph, a Schedule Tree and an -IterVar Relationship Graph. Schedule primitives perform operations on these -graphs. - -TEDD renders these three graphs from a given schedule. This tutorial demonstrates -how to use TEDD and how to interpret the rendered graphs. - -""" - -import tvm -from tvm import te -from tvm import topi -from tvm.contrib import tedd - -###################################################################### -# Define and Schedule Convolution with Bias and ReLU -# -------------------------------------------------- -# Let's build an example Tensor Expression for a convolution followed by Bias and ReLU. -# We first connect conv2d, add, and relu TOPIs. Then, we create a TOPI generic schedule. 
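# Note that ``topi.nn.conv2d_hwcn`` expects the HWCN layout, which is why
# the input placeholder below is declared as
# (in_size, in_size, in_channel, batch) and the filter as
# (kernel, kernel, in_channel, num_filter).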
-# - -batch = 1 -in_channel = 256 -in_size = 32 -num_filter = 256 -kernel = 3 -stride = 1 -padding = "SAME" -dilation = 1 - -A = te.placeholder((in_size, in_size, in_channel, batch), name="A") -W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W") -B = te.placeholder((1, num_filter, 1), name="bias") - -with tvm.target.Target("llvm"): - t_conv = topi.nn.conv2d_hwcn(A, W, stride, padding, dilation) - t_bias = topi.add(t_conv, B) - t_relu = topi.nn.relu(t_bias) - s = topi.generic.schedule_conv2d_hwcn([t_relu]) - -###################################################################### -# Render Graphs with TEDD -# ----------------------- -# We render graphs to see the computation -# and how it is scheduled. -# If you run the tutorial in a Jupyter notebook, you can use the following commented lines -# to render SVG figures showing in notebook directly. -# - -tedd.viz_dataflow_graph(s, dot_file_path="/tmp/dfg.dot") -# tedd.viz_dataflow_graph(s, show_svg = True) - -###################################################################### -# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tedd_dfg.png -# :align: center -# -# The first one is a dataflow graph. Every node represents a stage with name and memory -# scope shown in the middle and inputs/outputs information on the sides. -# Edges show nodes' dependency. -# - -tedd.viz_schedule_tree(s, dot_file_path="/tmp/scheduletree.dot") -# tedd.viz_schedule_tree(s, show_svg = True) - -###################################################################### -# We just rendered the schedule tree graph. You may notice an warning about ranges not -# available. -# The message also suggests to call normalize() to infer range information. We will -# skip inspecting the first schedule tree and encourage you to compare the graphs before -# and after normalize() for its impact. -# - -s = s.normalize() -tedd.viz_schedule_tree(s, dot_file_path="/tmp/scheduletree2.dot") -# tedd.viz_schedule_tree(s, show_svg = True) - -###################################################################### -# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tedd_st.png -# :align: center -# -# Now, let us take a close look at the second schedule tree. Every block under ROOT -# represents a -# stage. Stage name shows in the top row and compute shows in the bottom row. -# The middle rows are for IterVars, the higher the outer, the lower the inner. -# An IterVar row contains its index, name, type, and other optional information. -# Let's use the W.shared stage as an example. The top row tells -# its name, "W.shared", and memory scope, "Shared". Its compute is -# :code:`W(ax0, ax1, ax2, ax3)`. -# Its outer most loop IterVar is ax0.ax1.fused.ax2.fused.ax3.fused.outer, -# indexed with 0, of kDataPar, bound to threadIdx.y, and with range(min=0, ext=8). -# You can also tell -# IterVar type with the index box color, shown in the legend. -# -# If a stage doesn't compute_at any other stage, it has an edge directly to the -# ROOT node. Otherwise, it has an edge pointing to the IterVar it attaches to, -# such as W.shared attaches to rx.outer in the middle compute stage. -# - -###################################################################### -# .. note:: -# -# By definition, IterVars are internal nodes and computes are leaf nodes in -# a schedule tree. The edges among IterVars and compute within one stage are -# omitted, making every stage a block, for better readability. 
-# - -tedd.viz_itervar_relationship_graph(s, dot_file_path="/tmp/itervar.dot") -# tedd.viz_itervar_relationship_graph(s, show_svg = True) - -###################################################################### -# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tedd_itervar_rel.png -# :align: center -# -# The last one is an IterVar Relationship Graph. Every subgraph represents a -# stage and contains IterVar nodes and transformation nodes. For example, -# W.shared has three split nodes and three fuse nodes. The rest are IterVar -# nodes of the same format as the IterVar rows in Schedule Trees. Root -# IterVars are those not driven by any transformation node, such as ax0; leaf -# IterVars don't drive any transformation node and have non-negative indices, -# such as ax0.ax1.fused.ax2.fused.ax3.fused.outer with index of 0. -# - - -###################################################################### -# Summary -# ------- -# This tutorial demonstrates the usage of TEDD. We use an example built -# with TOPI to show the schedules under the hood. You can also use -# it before and after any schedule primitive to inspect its effect. -# diff --git a/gallery/how_to/work_with_schedules/tensorize.py b/gallery/how_to/work_with_schedules/tensorize.py deleted file mode 100644 index 8f7035511e1d..000000000000 --- a/gallery/how_to/work_with_schedules/tensorize.py +++ /dev/null @@ -1,317 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorials-tensorize: - -Use Tensorize to Leverage Hardware Intrinsics -============================================= -**Author**: `Yizhi Liu `_ - -This is an introduction material on how to perform tensorization in TVM. - -By using schedule primitive :code:`tensorize`, -people can replace a unit of computation with the corresponding intrinsics, -making it easy to leverage handcrafted micro-kernels, -as well as extend TVM to support new hardware architectures. - -The purpose of this tutorial is to show the functionality -and usage of tensorize instead of providing an efficient solution. - -""" -from __future__ import absolute_import, print_function - - -import tvm -from tvm import te -import tvm.testing -import numpy as np - -###################################################################### -# Define Matrix Multiplication -# ---------------------------- -# Take matrix multiplication as our example. -# Matmul first multiply the corresponding elements between two matrix, -# then accumulate across a certain axis. -# The following lines describe the computation :code:`A * B^T` in TVM. 
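######################################################################
# In numpy terms, the computation defined below is ``C = A.dot(B.T)``,
# i.e. ``C[i, j] = sum_k A[i, k] * B[j, k]``. A tiny reference sketch:
import numpy as np

a_ref = np.random.uniform(size=(4, 3)).astype("float32")
b_ref = np.random.uniform(size=(5, 3)).astype("float32")
c_ref = a_ref.dot(b_ref.T)  # shape (4, 5)
np.testing.assert_allclose(c_ref, np.einsum("ik,jk->ij", a_ref, b_ref), rtol=1e-5)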
-# -N, M, L = 1024, 512, 64 -A = te.placeholder((N, L), name="A") -B = te.placeholder((M, L), name="B") -k = te.reduce_axis((0, L), name="k") -C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[j, k], axis=k), name="C") -s = te.create_schedule(C.op) -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# Schedule the Matmul -# ------------------- -# Now, suppose we have an accelerator that supports -# matrix-vector multiplication (GEMV) as a hardware primitive, -# which can take arbitrary size of reduce axis, -# but another axis needs to be no larger than 16. -# Thus we break down the matmul loops to make the innermost loops a (16x64) GEMV. -# -factor = 16 -x, y = C.op.axis -(z,) = C.op.reduce_axis -yo, yi = s[C].split(y, factor=factor) -s[C].reorder(x, yo, yi, z) -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# As showed in the IR printed above, -# the inner loops :code:`j.inner` along with :code:`k` together form a computation of GEMV -# - within the inner most two loops, the index :code:`i` is fixed, -# the access to the matrix :code:`A` only varies by :code:`k`, -# which makes the access pattern of :code:`A` a "vector". -# In order to leverage our hypothetical hardware's GEMV instruction, -# we can tensorize over :code:`j.inner`. -# -# Define GEMV Tensorization Intrinsic -# ----------------------------------- -# Before scheduling the tensorization, we need to first define the intrinsic function for GEMV. -# It includes two parts, the first is a compute definition of GEMV. -# TVM uses it to match the computing pattern in the original Matmul schedule. -# The second is to specify how to execute GEMV on the device, -# which is done in :code:`intrin_func` below. -# -def intrin_gemv(m, l): - a = te.placeholder((l,), name="a") - b = te.placeholder((m, l), name="b") - k = te.reduce_axis((0, l), name="k") - c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name="c") - Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1]) - Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[te.var("s1"), 1]) - Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1]) - - def intrin_func(ins, outs): - ib = tvm.tir.ir_builder.create() - aa, bb = ins - cc = outs[0] - ib.emit( - tvm.tir.call_extern( - "int32", - "gemv_update", - cc.access_ptr("w"), - aa.access_ptr("r"), - bb.access_ptr("r"), - m, - l, - bb.strides[0], - ) - ) - return ib.get() - - return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb}) - - -###################################################################### -# Here :code:`te.decl_tensor_intrin` declares how to execute the computation :code:`c.op`. -# Our implementation simply takes the inputs and outputs, -# converts them to pointers and emit an external function call. -# Note that tensorization requires user to specify :code:`offset_factor`, -# with this information, TVM has knowledge of whether the data is aligned -# between the start address of the original data structure -# and the offset being passed to tensorize, -# so that it has chance to optimize with vectorized loading. -# We set the factor to 1 for simplification. -# -# Buffers are also declared for inputs and outputs, though this is not required, -# we benefit from the extra information provided by buffers. 
For example, we pass -# :code:`bb.strides[0]` as an argument to the external function :code:`gemv_update`. -# For now :code:`bb.strides[0] == l`, -# but later we will see how they can differ with more complicated schedules. -# -# Note that we use :code:`te.var("s1")` as the first stride dimension for :code:`B`. -# If the strides can be inferred -# - in this case, TVM knows tensor B is compact thus the strides are :code:`[L, 1]` - -# such placeholder can be put to let TVM automatically bind the inferred value for us. -# -gemv = intrin_gemv(factor, L) -s[C].tensorize(yi, gemv) -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# By tensorizing over :code:`yi`, the inner most two loops are -# now replaced by the intrinsic function we defined before. -# In order to build and run the module, let's define the external function :code:`gemv_update`, -# it is a naive implementation of GEMV, just for demonstration. -# -def gemv_impl(): - cc_code = """ - extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) { - for (int i = 0; i < m; ++i) { - for (int j = 0; j < l; ++j) { - cc[i] += aa[j] * bb[i * stride + j]; - } - } - return 0; - } - """ - from tvm.contrib import utils, clang - - temp = utils.tempdir() - ll_path = temp.relpath("temp.ll") - # Create LLVM ir from c source code - ll_code = clang.create_llvm(cc_code, output=ll_path) - return ll_code - - -###################################################################### -# Now we leverage the pragma attribute :code:`import_llvm` to import llvm asm inline. -# The importing needs to happen before the tensorized GEMV being executed. -# -s[C].pragma(x, "import_llvm", gemv_impl()) -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -###################################################################### -# Finally we compare the tensorize version with that :code:`numpy.dot` produces, -# ensure our implementation is correct. -# -func = tvm.build(s, [A, B, C], target="llvm", name="gemv") - -from tvm.topi.utils import get_const_tuple - -dtype = A.dtype -dev = tvm.device("cpu", 0) -a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype) -b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype) -c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), dev) -func(tvm.nd.array(a, dev), tvm.nd.array(b, dev), c) -tvm.testing.assert_allclose(c.numpy(), np.dot(a, b.T), rtol=1e-3) - -###################################################################### -# Reduce-update for Tensorize -# --------------------------- -# So far you have learned the basic idea of tensorize, -# now let's move one step forward to a more complicated case. -# -# Assume our accelerator could only multiply a vector by a square matrix, -# in which the vector size needs to be no larger than 16. -# Given such hardware constrain, now we need to split the reduce axis as following, -# -zo, zi = s[C].split(z, factor=factor) -s[C].reorder(x, yo, zo, yi, zi) - -###################################################################### -# However, since the tensorize intrinsic now only covers a part of the reduce axis, -# instead of using one "body" function, TVM requires a :code:`reduce_reset` function, -# which will be invoked before the reduce for-loop, and a :code:`reduce_update` function, -# which defines the "update" computing strategy. 
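######################################################################
# Conceptually, the split reduction lowers to a reset of the accumulator
# followed by one "update" call per reduction tile. A plain numpy sketch of
# that reset/update pattern (the actual intrinsics are defined right below):
import numpy as np


def gemv_tiled(a_vec, b_mat, tile):
    m_rows, l_red = b_mat.shape
    c_acc = np.zeros(m_rows, dtype=a_vec.dtype)  # reduce_reset
    for zo in range(l_red // tile):  # outer reduction loop kept by TVM
        lo, hi = zo * tile, (zo + 1) * tile
        c_acc += b_mat[:, lo:hi].dot(a_vec[lo:hi])  # reduce_update: one GEMV tile
    return c_acc


a_demo = np.random.uniform(size=(64,)).astype("float32")
b_demo = np.random.uniform(size=(16, 64)).astype("float32")
np.testing.assert_allclose(gemv_tiled(a_demo, b_demo, 16), b_demo.dot(a_demo), rtol=1e-5)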
-# -def gemv_impl(): - cc_code = """ - extern "C" int gemv_update(float *cc, float *aa, float *bb, int m, int l, int stride) { - for (int i = 0; i < m; ++i) { - for (int j = 0; j < l; ++j) { - cc[i] += aa[j] * bb[i * stride + j]; - } - } - return 0; - } - extern "C" int gemv_reset(float *cc, int m) { - for (int i = 0; i < m; ++i) { - cc[i] = 0.0; - } - return 0; - } - """ - from tvm.contrib import utils, clang - - temp = utils.tempdir() - ll_path = temp.relpath("temp.ll") - # Create LLVM ir from c source code - ll_code = clang.create_llvm(cc_code, output=ll_path) - return ll_code - - -def intrin_gemv(m, l): - a = te.placeholder((l,), name="a") - b = te.placeholder((m, l), name="b") - k = te.reduce_axis((0, l), name="k") - c = te.compute((m,), lambda i: te.sum(a[k] * b[i, k], axis=k), name="c") - Ab = tvm.tir.decl_buffer(a.shape, a.dtype, name="A", offset_factor=1, strides=[1]) - Bb = tvm.tir.decl_buffer(b.shape, b.dtype, name="B", offset_factor=1, strides=[te.var("s1"), 1]) - Cb = tvm.tir.decl_buffer(c.shape, c.dtype, name="C", offset_factor=1, strides=[1]) - - def intrin_func(ins, outs): - aa, bb = ins - cc = outs[0] - - def _body(): - ib = tvm.tir.ir_builder.create() - ib.emit( - tvm.tir.call_extern( - "int32", - "gemv_update", - cc.access_ptr("w"), - aa.access_ptr("r"), - bb.access_ptr("r"), - m, - l, - bb.strides[0], - ) - ) - return ib.get() - - def _reduce_reset(): - ib = tvm.tir.ir_builder.create() - ib.emit(tvm.tir.call_extern("int32", "gemv_reset", cc.access_ptr("w"), m)) - return ib.get() - - def _reduce_update(): - return _body() - - return _body(), _reduce_reset(), _reduce_update() - - return te.decl_tensor_intrin(c.op, intrin_func, binds={a: Ab, b: Bb, c: Cb}) - - -###################################################################### -# Note that :code:`intrin_func` now returns a triplet: -# :code:`(body, reduce_reset, reduce_update)`. -# If tensorization includes all the reduce axes, function :code:`body()` will be invoked, -# otherwise :code:`reduce_reset()` and :code:`reduce_update()` together will be used. -# In our example :code:`body()` and :code:`reduce_update()` -# share the same implementation, -# while in other cases, hardware may have different instructions for these two functions. -# Moreover, we can see now :code:`bb.strides[0]` is different from :code:`l` -# due to the tiling. -# -# Tensorize for squared GEMV, build and check the results, -# -gemv = intrin_gemv(factor, factor) -s[C].tensorize(yi, gemv) -s[C].pragma(yo, "import_llvm", gemv_impl()) - -func = tvm.build(s, [A, B, C], target="llvm", name="gemv") -a = np.random.uniform(size=get_const_tuple(A.shape)).astype(dtype) -b = np.random.uniform(size=get_const_tuple(B.shape)).astype(dtype) -c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=dtype), dev) -func(tvm.nd.array(a, dev), tvm.nd.array(b, dev), c) -tvm.testing.assert_allclose(c.numpy(), np.dot(a, b.T), rtol=1e-3) - -###################################################################### -# Summary -# ------- -# This tutorial demonstrates the usage of tensorize intrinsic in TVM. -# Tensorize provides a way for users to get fully optimized schedule via micro-kernels. -# For example, INT8 quantization on Intel CPUs uses tensorization -# to invoke AVX instruction directly. -# We also demonstrates how to use inline assembly importing, -# which helps users inject asm easily into the schedule. 
-# diff --git a/gallery/how_to/work_with_schedules/tuple_inputs.py b/gallery/how_to/work_with_schedules/tuple_inputs.py deleted file mode 100644 index edf82ddca75b..000000000000 --- a/gallery/how_to/work_with_schedules/tuple_inputs.py +++ /dev/null @@ -1,123 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compute and Reduce with Tuple Inputs -======================================= -**Author**: `Ziheng Jiang `_ - -Often we want to compute multiple outputs with the same shape within -a single loop or perform reduction that involves multiple values like -:code:`argmax`. These problems can be addressed by tuple inputs. - -In this tutorial, we will introduce the usage of tuple inputs in TVM. -""" -from __future__ import absolute_import, print_function - - -import tvm -from tvm import te -import numpy as np - -###################################################################### -# Describe Batchwise Computation -# ------------------------------ -# For operators which have the same shape, we can put them together as -# the inputs of :any:`te.compute`, if we want them to be scheduled -# together in the next schedule procedure. -# -n = te.var("n") -m = te.var("m") -A0 = te.placeholder((m, n), name="A0") -A1 = te.placeholder((m, n), name="A1") -B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A1[i, j] * 3), name="B") - -# The generated IR code would be: -s = te.create_schedule(B0.op) -print(tvm.lower(s, [A0, A1, B0, B1], simple_mode=True)) - -###################################################################### -# .. _reduction-with-tuple-inputs: -# -# Describe Reduction with Collaborative Inputs -# -------------------------------------------- -# Sometimes, we require multiple inputs to express some reduction -# operators, and the inputs will collaborate together, e.g. :code:`argmax`. -# In the reduction procedure, :code:`argmax` need to compare the value of -# operands, also need to keep the index of operand. It can be expressed -# with :py:func:`te.comm_reducer` as below: - -# x and y are the operands of reduction, both of them is a tuple of index -# and value. -def fcombine(x, y): - lhs = tvm.tir.Select((x[1] >= y[1]), x[0], y[0]) - rhs = tvm.tir.Select((x[1] >= y[1]), x[1], y[1]) - return lhs, rhs - - -# our identity element also need to be a tuple, so `fidentity` accepts -# two types as inputs. 
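# `fidentity` receives the two element dtypes of the tuple
# (index dtype, value dtype) and must return one identity element per
# member: -1 marks "no index yet", and the dtype's minimum value guarantees
# that any real element wins the comparison in `fcombine`.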
-def fidentity(t0, t1): - return tvm.tir.const(-1, t0), tvm.te.min_value(t1) - - -argmax = te.comm_reducer(fcombine, fidentity, name="argmax") - -# describe the reduction computation -m = te.var("m") -n = te.var("n") -idx = te.placeholder((m, n), name="idx", dtype="int32") -val = te.placeholder((m, n), name="val", dtype="int32") -k = te.reduce_axis((0, n), "k") -T0, T1 = te.compute((m,), lambda i: argmax((idx[i, k], val[i, k]), axis=k), name="T") - -# the generated IR code would be: -s = te.create_schedule(T0.op) -print(tvm.lower(s, [idx, val, T0, T1], simple_mode=True)) - -###################################################################### -# .. note:: -# -# For ones who are not familiar with reduction, please refer to -# :ref:`general-reduction`. - -###################################################################### -# Schedule Operation with Tuple Inputs -# ------------------------------------ -# It is worth mentioning that although you will get multiple outputs -# with one batch operation, but they can only be scheduled together -# in terms of operation. - -n = te.var("n") -m = te.var("m") -A0 = te.placeholder((m, n), name="A0") -B0, B1 = te.compute((m, n), lambda i, j: (A0[i, j] + 2, A0[i, j] * 3), name="B") -A1 = te.placeholder((m, n), name="A1") -C = te.compute((m, n), lambda i, j: A1[i, j] + B0[i, j], name="C") - -s = te.create_schedule(C.op) -s[B0].compute_at(s[C], C.op.axis[0]) -# as you can see in the below generated IR code: -print(tvm.lower(s, [A0, A1, C], simple_mode=True)) - -###################################################################### -# Summary -# ------- -# This tutorial introduces the usage of tuple inputs operation. -# -# - Describe normal batchwise computation. -# - Describe reduction operation with tuple inputs. -# - Notice that you can only schedule computation in terms of operation instead of tensor. diff --git a/gallery/tutorial/README.txt b/gallery/tutorial/README.txt deleted file mode 100644 index ceac645d7dc1..000000000000 --- a/gallery/tutorial/README.txt +++ /dev/null @@ -1,10 +0,0 @@ -User Tutorial -------------- - -This tutorial provides an introduction to TVM, meant to address user who is new -to the TVM project. It begins with some basic information on how TVM works, -then works through installing TVM, compiling and optimizing models, then -digging in deeper to the Tensor Expression language and the tuning and -optimization tools that are built on top of it. After completing the tutorial, -a new user should be familiar enough with TVM to optimize models, and will be -prepared to dig into TVM more deeply. diff --git a/gallery/tutorial/auto_scheduler_matmul_x86.py b/gallery/tutorial/auto_scheduler_matmul_x86.py deleted file mode 100644 index 14f8040bf851..000000000000 --- a/gallery/tutorial/auto_scheduler_matmul_x86.py +++ /dev/null @@ -1,215 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Optimizing Operators with Auto-scheduling -========================================= -**Author**: `Lianmin Zheng `_, \ - `Chengfan Jia `_ - -In this tutorial, we will show how TVM's Auto Scheduling feature can find -optimal schedules without the need for writing a custom template. - -Different from the template-based :doc:`AutoTVM ` which relies on -manual templates to define the search space, the auto-scheduler does not -require any templates. Users only need to write the computation declaration -without any schedule commands or templates. The auto-scheduler can -automatically generate a large search space and find a good schedule in the -space. - -We use matrix multiplication as an example in this tutorial. - -.. note:: - Note that this tutorial will not run on Windows or recent versions of macOS. To - get it to run, you will need to wrap the body of this tutorial in a :code:`if - __name__ == "__main__":` block. -""" - - -import numpy as np -import tvm -from tvm import te, auto_scheduler - -################################################################################ -# Defining the Matrix Multiplication -# ---------------------------------- -# To start, we define a matrix multiplication with a bias addition. Note that -# this uses standard operations available in TVMs Tensor Expression language. -# The major difference is the use of the :any:`register_workload` decorator at the top -# of the function definition. The function should return a list of -# input/output tensors. From these tensors, the auto-scheduler can get the -# whole computational graph. - - -@auto_scheduler.register_workload # Note the auto_scheduler decorator -def matmul_add(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - C = te.placeholder((N, M), name="C", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - matmul = te.compute( - (N, M), - lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), - name="matmul", - attrs={"layout_free_placeholders": [B]}, # enable automatic layout transform for tensor B - ) - out = te.compute((N, M), lambda i, j: matmul[i, j] + C[i, j], name="out") - - return [A, B, C, out] - - -################################################################################ -# Create the search task -# ---------------------- -# With the function defined, we can now create the task for the auto_scheduler -# to search against. We specify the particular parameters for this matrix -# multiplication, in this case a multiplication of two square matrices of size -# 1024x1024. We then create a search task with N=L=M=1024 and dtype="float32" -# -# .. admonition:: Improve performance with custom targets -# -# In order for TVM to take full advantage of specific hardware platforms, -# you will want to manually specify your CPU capabilities. 
For example: -# -# - replace ``llvm`` below with ``llvm -mcpu=core-avx2`` to enable AVX2 -# - replace ``llvm`` below with ``llvm -mcpu=skylake-avx512`` to enable AVX-512 - -target = tvm.target.Target("llvm") -N = L = M = 1024 -task = tvm.auto_scheduler.SearchTask(func=matmul_add, args=(N, L, M, "float32"), target=target) - -# Inspect the computational graph -print("Computational DAG:") -print(task.compute_dag) - -################################################################################ -# Set Parameters for Auto-Scheduler -# --------------------------------- -# Next, we set parameters for the auto-scheduler. -# -# * :code:`num_measure_trials` is the number of measurement trials we can use -# during the search. We only make 10 trials in this tutorial for a fast -# demonstration. In practice, 1000 is a good value for the search to converge. -# You can do more trials according to your time budget. -# * In addition, we use :any:`RecordToFile ` to log measurement records into a -# file ``matmul.json``. The measurement records can be used to query the history -# best, resume the search, and do more analyses later. -# * see :any:`TuningOptions ` for more parameters - -log_file = "matmul.json" -tune_option = auto_scheduler.TuningOptions( - num_measure_trials=10, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - verbose=2, -) - -################################################################################ -# Run the search -# -------------- -# Now we get all inputs ready. Pretty simple, isn't it? We can kick off the -# search and let the auto-scheduler do its magic. After some measurement -# trials, we can load the best schedule from the log file and apply it. - -# Run auto-tuning (search) -task.tune(tune_option) -# Apply the best schedule -sch, args = task.apply_best(log_file) - -################################################################################ -# Inspecting the Optimized Schedule -# --------------------------------- -# We can lower the schedule to see the IR after auto-scheduling. The -# auto-scheduler correctly performs optimizations including multi-level tiling, -# layout transformation, parallelization, vectorization, unrolling, and -# operator fusion. - -print("Lowered TIR:") -print(tvm.lower(sch, args, simple_mode=True)) - -################################################################################ -# Check correctness and evaluate performance -# ------------------------------------------ -# We build the binary and check its correctness and performance. - -func = tvm.build(sch, args, target) -a_np = np.random.uniform(size=(N, L)).astype(np.float32) -b_np = np.random.uniform(size=(L, M)).astype(np.float32) -c_np = np.random.uniform(size=(N, M)).astype(np.float32) -out_np = a_np.dot(b_np) + c_np - -dev = tvm.cpu() -a_tvm = tvm.nd.array(a_np, device=dev) -b_tvm = tvm.nd.array(b_np, device=dev) -c_tvm = tvm.nd.array(c_np, device=dev) -out_tvm = tvm.nd.empty(out_np.shape, device=dev) -func(a_tvm, b_tvm, c_tvm, out_tvm) - -# Check results -np.testing.assert_allclose(out_np, out_tvm.numpy(), rtol=1e-3) - -# Evaluate execution time. 
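# min_repeat_ms=500 below keeps re-running the kernel until each measurement
# lasts at least 500 ms, which reduces timer noise for fast kernels; the median
# of the collected results is then reported in milliseconds.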
-evaluator = func.time_evaluator(func.entry_name, dev, min_repeat_ms=500) -print( - "Execution time of this operator: %.3f ms" - % (np.median(evaluator(a_tvm, b_tvm, c_tvm, out_tvm).results) * 1000) -) - - -################################################################################ -# Using the record file -# --------------------- -# During the search, all measurement records are logged into the record file -# ``matmul.json```. The measurement records can be used to re-apply search -# results, resume the search, and perform other analyses. -# -# Here is an example where we load the best schedule from a file, and print the -# equivalent python schedule API. This can be used for debugging and learning -# the behavior of the auto-scheduler. - -print("Equivalent python schedule:") -print(task.print_best(log_file)) - -################################################################################ -# A more complicated example is to resume the search. In this case, we need to -# create the search policy and cost model by ourselves and resume the status of -# search policy and cost model with the log file. In the example below we -# resume the status and do more 5 trials. - - -def resume_search(task, log_file): - print("Resume search:") - cost_model = auto_scheduler.XGBModel() - cost_model.update_from_file(log_file) - search_policy = auto_scheduler.SketchPolicy( - task, cost_model, init_search_callbacks=[auto_scheduler.PreloadMeasuredStates(log_file)] - ) - tune_option = auto_scheduler.TuningOptions( - num_measure_trials=5, measure_callbacks=[auto_scheduler.RecordToFile(log_file)] - ) - task.tune(tune_option, search_policy=search_policy) - - -resume_search(task, log_file) - -################################################################################ -# Final Notes and Summary -# ----------------------- -# In this tutorial, we have shown how to use the TVM Auto-Scheduler to -# automatically optimize a matrix multiplication, without the need to specify a -# search template. It ends a series of examples that starts from the Tensor -# Expression (TE) language that demonstrates how TVM can optimize computational -# operations. diff --git a/gallery/tutorial/autotvm_matmul_x86.py b/gallery/tutorial/autotvm_matmul_x86.py deleted file mode 100644 index a2e355c8ca8f..000000000000 --- a/gallery/tutorial/autotvm_matmul_x86.py +++ /dev/null @@ -1,378 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. 
_tutorial-autotvm-matmul-x86: - -Optimizing Operators with Schedule Templates and AutoTVM -======================================================== -**Authors**: -`Lianmin Zheng `_, -`Chris Hoge `_ - -In this tutorial, we show how the TVM Tensor Expression (TE) language -can be used to write schedule templates that can be searched by AutoTVM to -find the optimal schedule. This process is called Auto-Tuning, which helps -automate the process of optimizing tensor computation. - -This tutorial builds on the previous :doc:`tutorial on how to write a matrix -multiplication using TE `. - -There are two steps in auto-tuning. - -- The first step is defining a search space. -- The second step is running a search algorithm to explore through this space. - -In this tutorial, you can learn how to perform these two steps in TVM. The whole -workflow is illustrated by a matrix multiplication example. - -.. note:: - Note that this tutorial will not run on Windows or recent versions of macOS. - To get it to run, you will need to wrap the body of this tutorial in a - :code:`if __name__ == "__main__":` block. -""" - -################################################################################ -# Install dependencies -# -------------------- -# To use autotvm package in TVM, we need to install some extra dependencies. -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost cloudpickle -# -# To make TVM run faster in tuning, it is recommended to use cython as FFI of -# TVM. In the root directory of TVM, execute: -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Begin by importing the required packages. - - -import logging -import sys - -import numpy as np -import tvm -from tvm import te -import tvm.testing - -# the module is called `autotvm` -from tvm import autotvm - -################################################################################ -# Basic Matrix Multiplication with TE -# ----------------------------------- -# Recall the basic implementation of matrix multiplication using TE. We write -# it down here with a few changes. We will wrap the multiplication in a python -# function definition. For simplicity, we will focus our attention on a split -# optimization, using a fixed value that defines the block size of the -# reordering. - - -def matmul_basic(N, L, M, dtype): - - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - yo, yi = s[C].split(y, 8) - xo, xi = s[C].split(x, 8) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -################################################################################ -# Matrix Multiplication with AutoTVM -# ---------------------------------- -# In the previous schedule code, we use a constant "8" as the tiling factor. -# However, it might not be the best one because the best tiling factor depends -# on real hardware environment and input shape. -# -# If you want the schedule code to be portable across a wider range of input -# shapes and target hardware, it is better to define a set of candidate values -# and pick the best one according to the measurement results on target -# hardware. -# -# In autotvm, we can define a tunable parameter, or a "knob" for such kind of -# value. 
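################################################################################
# .. admonition:: A minimal sketch of a knob
#
#    The full template is developed in the next section; the fragment below is
#    only a sketch of the API shape, and the knob name ``tile_size`` is made up
#    for illustration.
#
#    .. code-block:: python
#
#        cfg = autotvm.get_config()                       # config object for the current template
#        cfg.define_knob("tile_size", [1, 2, 4, 8, 16])   # candidate blocking factors
#        factor = cfg["tile_size"].val                    # value chosen for the current trial
#        yo, yi = s[C].split(y, factor)                   # schedule with the chosen value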
- -################################################################################ -# A Basic Matrix Multiplication Template -# -------------------------------------- -# We begin with an example of how to create a tunable parameter set for the -# block size of the `split` scheduling operation. - -# Matmul V1: List candidate values -@autotvm.template("tutorial/matmul_v1") # 1. use a decorator -def matmul_v1(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - # 2. get the config object - cfg = autotvm.get_config() - - # 3. define search space - cfg.define_knob("tile_y", [1, 2, 4, 8, 16]) - cfg.define_knob("tile_x", [1, 2, 4, 8, 16]) - - # 4. schedule according to config - yo, yi = s[C].split(y, cfg["tile_y"].val) - xo, xi = s[C].split(x, cfg["tile_x"].val) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -################################################################################ -# Here we make four modifications to the previous schedule code and get a -# tunable "template". We can explain the modifications one by one. -# -# 1. Use a decorator to mark this function as a simple template. -# 2. Get a config object: You can regard this :code:`cfg` as an argument of -# this function but we obtain it in a different way. With this argument, this -# function is no longer a deterministic schedule. Instead, we can pass -# different configurations to this function and get different schedules. A -# function that uses a configuration object like this is called a "template". -# -# To make the template function more compact, we can do two things to define -# the parameter search space within a single function. -# -# 1. Define a search space across a set values. This is done by making -# :code:`cfg` a :any:`ConfigSpace` object. It will collect all of the -# tunable knobs in this function and build a search space from it. -# 2. Schedule according to an entity in this space. This is done by making -# :code:`cfg` a :any:`ConfigEntity` object. When it is a -# :any:`ConfigEntity`, it will ignore all space definition API (namely, -# :code:`cfg.define_XXXXX(...)`). Instead, it will store deterministic -# values for all tunable knobs, and we schedule according to these values. -# -# During auto-tuning, we will first call this template with a -# :any:`ConfigSpace` object to build the search space. Then we call this -# template with different :any:`ConfigEntity` in the built space to get -# different schedules. Finally we will measure the code generated by -# different schedules and pick the best one. -# -# 3. Define two tunable knobs. The first one is :code:`tile_y` with 5 possible -# values. The second one is :code:`tile_x` with a same list of possible values. -# These two knobs are independent, so they span a search space with size 25 = -# 5x5. -# 4. The configuration knobs are passed to the :code:`split` schedule -# operation, allowing us to schedule according to the 5x5 deterministic values -# we previously defined in :code:`cfg`. 
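################################################################################
# .. admonition:: Inspecting the space built from this template
#
#    As a quick sketch (mirroring what the tutorial does later for the
#    ``tutorial/matmul`` template), the 5x5 space described above can be
#    materialized and printed by creating a tuning task from the registered
#    template:
#
#    .. code-block:: python
#
#        task = autotvm.task.create(
#            "tutorial/matmul_v1", args=(512, 512, 512, "float32"), target="llvm"
#        )
#        print(task.config_space)   # tile_y and tile_x knobs, 25 candidate configs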
- -################################################################################ -# A Matrix Multiplication Template with the Advanced Parameter API -# ---------------------------------------------------------------- -# In the previous template, we manually listed all of the possible values for a -# knob. This is the lowest level API to define the space, and gives an explicit -# enumeration of the parameter space to search. However, we also provide -# another set of APIs that can make the definition of the search space easier -# and smarter. Where possible, we recommend you use this higher-level API -# -# In the following example, we use :any:`ConfigSpace.define_split` to define a -# split knob. It will enumerate all the possible ways to split an axis and -# construct the space. -# -# We also have :any:`ConfigSpace.define_reorder` for reorder knob and -# :any:`ConfigSpace.define_annotate` for annotation like unroll, vectorization, -# thread binding. When the high level API cannot meet your requirements, you -# can always fall back to using the low level API. - - -@autotvm.template("tutorial/matmul") -def matmul(N, L, M, dtype): - A = te.placeholder((N, L), name="A", dtype=dtype) - B = te.placeholder((L, M), name="B", dtype=dtype) - - k = te.reduce_axis((0, L), name="k") - C = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C") - s = te.create_schedule(C.op) - - # schedule - y, x = s[C].op.axis - k = s[C].op.reduce_axis[0] - - ##### define space begin ##### - cfg = autotvm.get_config() - cfg.define_split("tile_y", y, num_outputs=2) - cfg.define_split("tile_x", x, num_outputs=2) - ##### define space end ##### - - # schedule according to config - yo, yi = cfg["tile_y"].apply(s, C, y) - xo, xi = cfg["tile_x"].apply(s, C, x) - - s[C].reorder(yo, xo, k, yi, xi) - - return s, [A, B, C] - - -################################################################################ -# .. admonition:: More Explanation on :code:`cfg.define_split` -# -# In this template, :code:`cfg.define_split("tile_y", y, num_outputs=2)` will -# enumerate all possible combinations that can split axis y into two axes with -# factors of the length of y. For example, if the length of y is 32 and we -# want to split it into two axes using factors of 32, then there are 6 -# possible values for (length of outer axis, length of inner axis) pair, -# namely (32, 1), (16, 2), (8, 4), (4, 8), (2, 16) or (1, 32). These are all 6 -# possible values of `tile_y`. -# -# During scheduling, :code:`cfg["tile_y"]` is a :code:`SplitEntity` object. -# We stores the lengths of outer axes and inner axes in -# :code:`cfg['tile_y'].size` (a tuple with two elements). In this template, -# we apply it by using :code:`yo, yi = cfg['tile_y'].apply(s, C, y)`. -# Actually, this is equivalent to :code:`yo, yi = s[C].split(y, -# cfg["tile_y"].size[1])` or :code:`yo, yi = s[C].split(y, -# nparts=cfg['tile_y"].size[0])` -# -# The advantage of using cfg.apply API is that it makes multi-level splits -# (that is, when num_outputs >= 3) easier. - -################################################################################ -# Step 2: Use AutoTVM to Optimize the Matrix Multiplication -# --------------------------------------------------------- -# In Step 1, we wrote a matrix multiplication template that allowed us to -# parameterize the block size used in the `split` schedule. We can now conduct -# a search over this parameter space. The next step is to pick a tuner to guide -# the exploration of this space. 
-# -# Auto-tuners in TVM -# ~~~~~~~~~~~~~~~~~~ -# The job for a tuner can be described by following pseudo code -# -# .. code-block:: c -# -# ct = 0 -# while ct < max_number_of_trials: -# propose a batch of configs -# measure this batch of configs on real hardware and get results -# ct += batch_size -# -# When proposing the next batch of configs, the tuner can take different -# strategies. Some of the tuner strategies provided by TVM include: -# -# * :any:`tvm.autotvm.tuner.RandomTuner`: Enumerate the space in a random order -# * :any:`tvm.autotvm.tuner.GridSearchTuner`: Enumerate the space in a grid search order -# * :any:`tvm.autotvm.tuner.GATuner`: Using genetic algorithm to search through the space -# * :any:`tvm.autotvm.tuner.XGBTuner`: Uses a model based method. Train a XGBoost model to -# predict the speed of lowered IR and pick the next batch according to the -# prediction. -# -# You can choose the tuner according to the size of your space, your time -# budget and other factors. For example, if your space is very small (less -# than 1000), a grid-search tuner or a random tuner is good enough. If your -# space is at the level of 10^9 (this is the space size of a conv2d operator on -# CUDA GPU), XGBoostTuner can explore more efficiently and find better configs. - -################################################################################ -# Begin tuning -# ~~~~~~~~~~~~ -# Here we continue our matrix multiplication example. First we create a tuning -# task. We can also inspect the initialized search space. In this case, for a -# 512x512 square matrix multiplication, the space size is 10x10=100 Note that -# the task and search space are independent of the tuner picked. - -N, L, M = 512, 512, 512 -task = autotvm.task.create("tutorial/matmul", args=(N, L, M, "float32"), target="llvm") -print(task.config_space) - -################################################################################ -# Then we need to define how to measure the generated code and pick a tuner. -# Since our space is small, a random tuner is just okay. -# -# We only make 10 trials in this tutorial for demonstration. In practice, you -# can do more trials according to your time budget. We will log the tuning -# results into a log file. This file can be used to choose the best -# configuration discovered by the tuner later. - -# logging config (for printing tuning log to the screen) -logging.getLogger("autotvm").setLevel(logging.DEBUG) -logging.getLogger("autotvm").addHandler(logging.StreamHandler(sys.stdout)) - -################################################################################ -# There are two steps for measuring a config: build and run. By default, we use -# all CPU cores to compile program. We then measure them sequentially. To help -# reduce variance, we take 5 measurements and average them. -measure_option = autotvm.measure_option(builder="local", runner=autotvm.LocalRunner(number=5)) - -# Begin tuning with RandomTuner, log records to file `matmul.log` -# You can use alternatives like XGBTuner. -tuner = autotvm.tuner.RandomTuner(task) -tuner.tune( - n_trial=10, - measure_option=measure_option, - callbacks=[autotvm.callback.log_to_file("matmul.log")], -) - -################################################################################ -# With tuning completed, we can choose the configuration from the log file that -# has the best measured performance and compile the schedule with the -# corresponding parameters. 
We also do a quick verification that the schedule is -# producing correct answers. We can call the function :code:`matmul` directly -# under the :any:`autotvm.apply_history_best` context. When we call this -# function, it will query the dispatch context with its argument and get the -# best config with the same argument. - -# apply history best from log file -with autotvm.apply_history_best("matmul.log"): - with tvm.target.Target("llvm"): - s, arg_bufs = matmul(N, L, M, "float32") - func = tvm.build(s, arg_bufs) - -# check correctness -a_np = np.random.uniform(size=(N, L)).astype(np.float32) -b_np = np.random.uniform(size=(L, M)).astype(np.float32) -c_np = a_np.dot(b_np) - -c_tvm = tvm.nd.empty(c_np.shape) -func(tvm.nd.array(a_np), tvm.nd.array(b_np), c_tvm) - -tvm.testing.assert_allclose(c_np, c_tvm.numpy(), rtol=1e-4) - -################################################################################ -# Final Notes and Summary -# ----------------------- -# In this tutorial, we have shown how to build operator templates that allow -# TVM to search a parameter space and choose optimized schedule configurations. -# To gain a deeper understanding of how this works, we recommend expanding on -# this example by adding new search parameters to the schedule based on -# schedule operations demonstrated in the :ref: `Getting Started With Tensor -# Expressions _` tutorial. In the upcoming sections, we -# will demonstrate the AutoScheduler, a method for TVM to optimize common -# operators without the need for the user to provide a user-defined template. diff --git a/gallery/tutorial/autotvm_relay_x86.py b/gallery/tutorial/autotvm_relay_x86.py deleted file mode 100644 index 894f317708f6..000000000000 --- a/gallery/tutorial/autotvm_relay_x86.py +++ /dev/null @@ -1,516 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compiling and Optimizing a Model with the Python Interface (AutoTVM) -==================================================================== -**Author**: -`Chris Hoge `_ - -In the `TVMC Tutorial `_, we covered how to compile, run, and tune a -pre-trained vision model, ResNet-50 v2 using the command line interface for -TVM, TVMC. TVM is more that just a command-line tool though, it is an -optimizing framework with APIs available for a number of different languages -that gives you tremendous flexibility in working with machine learning models. - -In this tutorial we will cover the same ground we did with TVMC, but show how -it is done with the Python API. Upon completion of this section, we will have -used the Python API for TVM to accomplish the following tasks: - -* Compile a pre-trained ResNet-50 v2 model for the TVM runtime. -* Run a real image through the compiled model, and interpret the output and model - performance. 
-* Tune the model that model on a CPU using TVM. -* Re-compile an optimized model using the tuning data collected by TVM. -* Run the image through the optimized model, and compare the output and model - performance. - -The goal of this section is to give you an overview of TVM's capabilites and -how to use them through the Python API. -""" - - -################################################################################ -# TVM is a deep learning compiler framework, with a number of different modules -# available for working with deep learning models and operators. In this -# tutorial we will work through how to load, compile, and optimize a model -# using the Python API. -# -# We begin by importing a number of dependencies, including ``onnx`` for -# loading and converting the model, helper utilities for downloading test data, -# the Python Image Library for working with the image data, ``numpy`` for pre -# and post-processing of the image data, the TVM Relay framework, and the TVM -# Graph Executor. - -import onnx -from tvm.contrib.download import download_testdata -from PIL import Image -import numpy as np -import tvm.relay as relay -import tvm -from tvm.contrib import graph_executor - -################################################################################ -# Downloading and Loading the ONNX Model -# -------------------------------------- -# -# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a -# convolutional neural network that is 50 layers deep and designed to classify -# images. The model we will be using has been pre-trained on more than a -# million images with 1000 different classifications. The network has an input -# image size of 224x224. If you are interested exploring more of how the -# ResNet-50 model is structured, we recommend downloading -# `Netron `_, a freely available ML model viewer. -# -# TVM provides a helper library to download pre-trained models. By providing a -# model URL, file name, and model type through the module, TVM will download -# the model and save it to disk. For the instance of an ONNX model, you can -# then load it into memory using the ONNX runtime. -# -# .. admonition:: Working with Other Model Formats -# -# TVM supports many popular model formats. A list can be found in the -# :ref:`Compile Deep Learning Models ` section of the TVM -# Documentation. - -model_url = ( - "https://github.com/onnx/models/raw/bd206494e8b6a27b25e5cf7199dbcdbfe9d05d1c/" - "vision/classification/resnet/model/" - "resnet50-v2-7.onnx" -) - -model_path = download_testdata(model_url, "resnet50-v2-7.onnx", module="onnx") -onnx_model = onnx.load(model_path) - -# Seed numpy's RNG to get consistent results -np.random.seed(0) - -################################################################################ -# Downloading, Preprocessing, and Loading the Test Image -# ------------------------------------------------------ -# -# Each model is particular when it comes to expected tensor shapes, formats and -# data types. For this reason, most models require some pre and -# post-processing, to ensure the input is valid and to interpret the output. -# TVMC has adopted NumPy's ``.npz`` format for both input and output data. -# -# As input for this tutorial, we will use the image of a cat, but you can feel -# free to substitute this image for any of your choosing. -# -# .. 
image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg -# :height: 224px -# :width: 224px -# :align: center -# -# Download the image data, then convert it to a numpy array to use as an input to the model. - -img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" -img_path = download_testdata(img_url, "imagenet_cat.png", module="data") - -# Resize it to 224x224 -resized_image = Image.open(img_path).resize((224, 224)) -img_data = np.asarray(resized_image).astype("float32") - -# Our input image is in HWC layout while ONNX expects CHW input, so convert the array -img_data = np.transpose(img_data, (2, 0, 1)) - -# Normalize according to the ImageNet input specification -imagenet_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) -imagenet_stddev = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) -norm_img_data = (img_data / 255 - imagenet_mean) / imagenet_stddev - -# Add the batch dimension, as we are expecting 4-dimensional input: NCHW. -img_data = np.expand_dims(norm_img_data, axis=0) - -############################################################################### -# Compile the Model With Relay -# ---------------------------- -# -# The next step is to compile the ResNet model. We begin by importing the model -# to relay using the `from_onnx` importer. We then build the model, with -# standard optimizations, into a TVM library. Finally, we create a TVM graph -# runtime module from the library. - -target = "llvm" - -###################################################################### -# .. admonition:: Defining the Correct Target -# -# Specifying the correct target can have a huge impact on the performance of -# the compiled module, as it can take advantage of hardware features -# available on the target. For more information, please refer to -# :ref:`Auto-tuning a convolutional network for x86 CPU `. -# We recommend identifying which CPU you are running, along with optional -# features, and set the target appropriately. For example, for some -# processors ``target = "llvm -mcpu=skylake"``, or ``target = "llvm -# -mcpu=skylake-avx512"`` for processors with the AVX-512 vector instruction -# set. -# - -# The input name may vary across model types. You can use a tool -# like Netron to check input names -input_name = "data" -shape_dict = {input_name: img_data.shape} - -mod, params = relay.frontend.from_onnx(onnx_model, shape_dict) - -with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - -dev = tvm.device(str(target), 0) -module = graph_executor.GraphModule(lib["default"](dev)) - -###################################################################### -# Execute on the TVM Runtime -# -------------------------- -# Now that we've compiled the model, we can use the TVM runtime to make -# predictions with it. To use TVM to run the model and make predictions, we -# need two things: -# -# - The compiled model, which we just produced. -# - Valid input to the model to make predictions on. - -dtype = "float32" -module.set_input(input_name, img_data) -module.run() -output_shape = (1, 1000) -tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() - -################################################################################ -# Collect Basic Performance Data -# ------------------------------ -# We want to collect some basic performance data associated with this -# unoptimized model and compare it to a tuned model later. 
To help account for -# CPU noise, we run the computation in multiple batches in multiple -# repetitions, then gather some basis statistics on the mean, median, and -# standard deviation. -import timeit - -timing_number = 10 -timing_repeat = 10 -unoptimized = ( - np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) - * 1000 - / timing_number -) -unoptimized = { - "mean": np.mean(unoptimized), - "median": np.median(unoptimized), - "std": np.std(unoptimized), -} - -print(unoptimized) - -################################################################################ -# Postprocess the output -# ---------------------- -# -# As previously mentioned, each model will have its own particular way of -# providing output tensors. -# -# In our case, we need to run some post-processing to render the outputs from -# ResNet-50 v2 into a more human-readable form, using the lookup-table provided -# for the model. - -from scipy.special import softmax - -# Download a list of labels -labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" -labels_path = download_testdata(labels_url, "synset.txt", module="data") - -with open(labels_path, "r") as f: - labels = [l.rstrip() for l in f] - -# Open the output and read the output tensor -scores = softmax(tvm_output) -scores = np.squeeze(scores) -ranks = np.argsort(scores)[::-1] -for rank in ranks[0:5]: - print("class='%s' with probability=%f" % (labels[rank], scores[rank])) - -################################################################################ -# This should produce the following output: -# -# .. code-block:: bash -# -# # class='n02123045 tabby, tabby cat' with probability=0.610553 -# # class='n02123159 tiger cat' with probability=0.367179 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 - -################################################################################ -# Tune the model -# -------------- -# The previous model was compiled to work on the TVM runtime, but did not -# include any platform specific optimization. In this section, we will show you -# how to build an optimized model using TVM to target your working platform. -# -# In some cases, we might not get the expected performance when running -# inferences using our compiled module. In cases like this, we can make use of -# the auto-tuner, to find a better configuration for our model and get a boost -# in performance. Tuning in TVM refers to the process by which a model is -# optimized to run faster on a given target. This differs from training or -# fine-tuning in that it does not affect the accuracy of the model, but only -# the runtime performance. As part of the tuning process, TVM will try running -# many different operator implementation variants to see which perform best. -# The results of these runs are stored in a tuning records file. -# -# In the simplest form, tuning requires you to provide three things: -# -# - the target specification of the device you intend to run this model on -# - the path to an output file in which the tuning records will be stored -# - a path to the model to be tuned. -# - -import tvm.auto_scheduler as auto_scheduler -from tvm.autotvm.tuner import XGBTuner -from tvm import autotvm - -################################################################################ -# Set up some basic parameters for the runner. 
The runner takes compiled code -# that is generated with a specific set of parameters and measures the -# performance of it. ``number`` specifies the number of different -# configurations that we will test, while ``repeat`` specifies how many -# measurements we will take of each configuration. ``min_repeat_ms`` is a value -# that specifies how long need to run configuration test. If the number of -# repeats falls under this time, it will be increased. This option is necessary -# for accurate tuning on GPUs, and is not required for CPU tuning. Setting this -# value to 0 disables it. The ``timeout`` places an upper limit on how long to -# run training code for each tested configuration. - -number = 10 -repeat = 1 -min_repeat_ms = 0 # since we're tuning on a CPU, can be set to 0 -timeout = 10 # in seconds - -# create a TVM runner -runner = autotvm.LocalRunner( - number=number, - repeat=repeat, - timeout=timeout, - min_repeat_ms=min_repeat_ms, - enable_cpu_cache_flush=True, -) - -################################################################################ -# Create a simple structure for holding tuning options. We use an XGBoost -# algorithim for guiding the search. For a production job, you will want to set -# the number of trials to be larger than the value of 20 used here. For CPU we -# recommend 1500, for GPU 3000-4000. The number of trials required can depend -# on the particular model and processor, so it's worth spending some time -# evaluating performance across a range of values to find the best balance -# between tuning time and model optimization. Because running tuning is time -# intensive we set number of trials to 10, but do not recommend a value this -# small. The ``early_stopping`` parameter is the minimum number of trails to -# run before a condition that stops the search early can be applied. The -# measure option indicates where trial code will be built, and where it will be -# run. In this case, we're using the ``LocalRunner`` we just created and a -# ``LocalBuilder``. The ``tuning_records`` option specifies a file to write -# the tuning data to. - -tuning_option = { - "tuner": "xgb", - "trials": 20, - "early_stopping": 100, - "measure_option": autotvm.measure_option( - builder=autotvm.LocalBuilder(build_func="default"), runner=runner - ), - "tuning_records": "resnet-50-v2-autotuning.json", -} - -################################################################################ -# .. admonition:: Defining the Tuning Search Algorithm -# -# By default this search is guided using an `XGBoost Grid` algorithm. -# Depending on your model complexity and amount of time available, you might -# want to choose a different algorithm. - - -################################################################################ -# .. admonition:: Setting Tuning Parameters -# -# In this example, in the interest of time, we set the number of trials and -# early stopping to 20 and 100. You will likely see more performance improvements if -# you set these values to be higher but this comes at the expense of time -# spent tuning. The number of trials required for convergence will vary -# depending on the specifics of the model and the target platform. - -# begin by extracting the tasks from the onnx model -tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params) - -# Tune the extracted tasks sequentially. 
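# Note that only XGBTuner is imported above; selecting the "ga", "random", or
# "gridsearch" branches below additionally requires
#     from tvm.autotvm.tuner import GATuner, RandomTuner, GridSearchTuner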
-for i, task in enumerate(tasks): - prefix = "[Task %2d/%2d] " % (i + 1, len(tasks)) - - # choose tuner - tuner = "xgb" - - # create tuner - if tuner == "xgb": - tuner_obj = XGBTuner(task, loss_type="reg") - elif tuner == "xgb_knob": - tuner_obj = XGBTuner(task, loss_type="reg", feature_type="knob") - elif tuner == "xgb_itervar": - tuner_obj = XGBTuner(task, loss_type="reg", feature_type="itervar") - elif tuner == "xgb_curve": - tuner_obj = XGBTuner(task, loss_type="reg", feature_type="curve") - elif tuner == "xgb_rank": - tuner_obj = XGBTuner(task, loss_type="rank") - elif tuner == "xgb_rank_knob": - tuner_obj = XGBTuner(task, loss_type="rank", feature_type="knob") - elif tuner == "xgb_rank_itervar": - tuner_obj = XGBTuner(task, loss_type="rank", feature_type="itervar") - elif tuner == "xgb_rank_curve": - tuner_obj = XGBTuner(task, loss_type="rank", feature_type="curve") - elif tuner == "xgb_rank_binary": - tuner_obj = XGBTuner(task, loss_type="rank-binary") - elif tuner == "xgb_rank_binary_knob": - tuner_obj = XGBTuner(task, loss_type="rank-binary", feature_type="knob") - elif tuner == "xgb_rank_binary_itervar": - tuner_obj = XGBTuner(task, loss_type="rank-binary", feature_type="itervar") - elif tuner == "xgb_rank_binary_curve": - tuner_obj = XGBTuner(task, loss_type="rank-binary", feature_type="curve") - elif tuner == "ga": - tuner_obj = GATuner(task, pop_size=50) - elif tuner == "random": - tuner_obj = RandomTuner(task) - elif tuner == "gridsearch": - tuner_obj = GridSearchTuner(task) - else: - raise ValueError("Invalid tuner: " + tuner) - - tuner_obj.tune( - n_trial=min(tuning_option["trials"], len(task.config_space)), - early_stopping=tuning_option["early_stopping"], - measure_option=tuning_option["measure_option"], - callbacks=[ - autotvm.callback.progress_bar(tuning_option["trials"], prefix=prefix), - autotvm.callback.log_to_file(tuning_option["tuning_records"]), - ], - ) - -################################################################################ -# The output from this tuning process will look something like this: -# -# .. code-block:: bash -# -# # [Task 1/24] Current/Best: 10.71/ 21.08 GFLOPS | Progress: (60/1000) | 111.77 s Done. -# # [Task 1/24] Current/Best: 9.32/ 24.18 GFLOPS | Progress: (192/1000) | 365.02 s Done. -# # [Task 2/24] Current/Best: 22.39/ 177.59 GFLOPS | Progress: (960/1000) | 976.17 s Done. -# # [Task 3/24] Current/Best: 32.03/ 153.34 GFLOPS | Progress: (800/1000) | 776.84 s Done. -# # [Task 4/24] Current/Best: 11.96/ 156.49 GFLOPS | Progress: (960/1000) | 632.26 s Done. -# # [Task 5/24] Current/Best: 23.75/ 130.78 GFLOPS | Progress: (800/1000) | 739.29 s Done. -# # [Task 6/24] Current/Best: 38.29/ 198.31 GFLOPS | Progress: (1000/1000) | 624.51 s Done. -# # [Task 7/24] Current/Best: 4.31/ 210.78 GFLOPS | Progress: (1000/1000) | 701.03 s Done. -# # [Task 8/24] Current/Best: 50.25/ 185.35 GFLOPS | Progress: (972/1000) | 538.55 s Done. -# # [Task 9/24] Current/Best: 50.19/ 194.42 GFLOPS | Progress: (1000/1000) | 487.30 s Done. -# # [Task 10/24] Current/Best: 12.90/ 172.60 GFLOPS | Progress: (972/1000) | 607.32 s Done. -# # [Task 11/24] Current/Best: 62.71/ 203.46 GFLOPS | Progress: (1000/1000) | 581.92 s Done. -# # [Task 12/24] Current/Best: 36.79/ 224.71 GFLOPS | Progress: (1000/1000) | 675.13 s Done. -# # [Task 13/24] Current/Best: 7.76/ 219.72 GFLOPS | Progress: (1000/1000) | 519.06 s Done. -# # [Task 14/24] Current/Best: 12.26/ 202.42 GFLOPS | Progress: (1000/1000) | 514.30 s Done. 
-# # [Task 15/24] Current/Best: 31.59/ 197.61 GFLOPS | Progress: (1000/1000) | 558.54 s Done. -# # [Task 16/24] Current/Best: 31.63/ 206.08 GFLOPS | Progress: (1000/1000) | 708.36 s Done. -# # [Task 17/24] Current/Best: 41.18/ 204.45 GFLOPS | Progress: (1000/1000) | 736.08 s Done. -# # [Task 18/24] Current/Best: 15.85/ 222.38 GFLOPS | Progress: (980/1000) | 516.73 s Done. -# # [Task 19/24] Current/Best: 15.78/ 203.41 GFLOPS | Progress: (1000/1000) | 587.13 s Done. -# # [Task 20/24] Current/Best: 30.47/ 205.92 GFLOPS | Progress: (980/1000) | 471.00 s Done. -# # [Task 21/24] Current/Best: 46.91/ 227.99 GFLOPS | Progress: (308/1000) | 219.18 s Done. -# # [Task 22/24] Current/Best: 13.33/ 207.66 GFLOPS | Progress: (1000/1000) | 761.74 s Done. -# # [Task 23/24] Current/Best: 53.29/ 192.98 GFLOPS | Progress: (1000/1000) | 799.90 s Done. -# # [Task 24/24] Current/Best: 25.03/ 146.14 GFLOPS | Progress: (1000/1000) | 1112.55 s Done. - -################################################################################ -# Compiling an Optimized Model with Tuning Data -# ---------------------------------------------- -# -# As an output of the tuning process above, we obtained the tuning records -# stored in ``resnet-50-v2-autotuning.json``. The compiler will use the results to -# generate high performance code for the model on your specified target. -# -# Now that tuning data for the model has been collected, we can re-compile the -# model using optimized operators to speed up our computations. - -with autotvm.apply_history_best(tuning_option["tuning_records"]): - with tvm.transform.PassContext(opt_level=3, config={}): - lib = relay.build(mod, target=target, params=params) - -dev = tvm.device(str(target), 0) -module = graph_executor.GraphModule(lib["default"](dev)) - -################################################################################ -# Verify that the optimized model runs and produces the same results: - -dtype = "float32" -module.set_input(input_name, img_data) -module.run() -output_shape = (1, 1000) -tvm_output = module.get_output(0, tvm.nd.empty(output_shape)).numpy() - -scores = softmax(tvm_output) -scores = np.squeeze(scores) -ranks = np.argsort(scores)[::-1] -for rank in ranks[0:5]: - print("class='%s' with probability=%f" % (labels[rank], scores[rank])) - -################################################################################ -# Verifying that the predictions are the same: -# -# .. code-block:: bash -# -# # class='n02123045 tabby, tabby cat' with probability=0.610550 -# # class='n02123159 tiger cat' with probability=0.367181 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 - -################################################################################ -# Comparing the Tuned and Untuned Models -# -------------------------------------- -# We want to collect some basic performance data associated with this optimized -# model to compare it to the unoptimized model. Depending on your underlying -# hardware, number of iterations, and other factors, you should see a performance -# improvement in comparing the optimized model to the unoptimized model. 
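# A simple summary of the comparison is the speedup ratio
# ``unoptimized["mean"] / optimized["mean"]``, using the timing dictionary
# collected earlier for the unoptimized module and the one computed below.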
- -import timeit - -timing_number = 10 -timing_repeat = 10 -optimized = ( - np.array(timeit.Timer(lambda: module.run()).repeat(repeat=timing_repeat, number=timing_number)) - * 1000 - / timing_number -) -optimized = {"mean": np.mean(optimized), "median": np.median(optimized), "std": np.std(optimized)} - - -print("optimized: %s" % (optimized)) -print("unoptimized: %s" % (unoptimized)) - -################################################################################ -# Final Remarks -# ------------- -# -# In this tutorial, we gave a short example of how to use the TVM Python API -# to compile, run, and tune a model. We also discussed the need for pre and -# post-processing of inputs and outputs. After the tuning process, we -# demonstrated how to compare the performance of the unoptimized and optimize -# models. -# -# Here we presented a simple example using ResNet-50 v2 locally. However, TVM -# supports many more features including cross-compilation, remote execution and -# profiling/benchmarking. diff --git a/gallery/tutorial/intro_topi.py b/gallery/tutorial/intro_topi.py deleted file mode 100644 index cfebc36b8128..000000000000 --- a/gallery/tutorial/intro_topi.py +++ /dev/null @@ -1,153 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-topi: - -Introduction to TOPI -==================== -**Author**: `Ehsan M. Kermani `_ - -This is an introductory tutorial to TVM Operator Inventory (TOPI). -TOPI provides numpy-style generic operations and schedules with higher abstractions than TVM. -In this tutorial, we will see how TOPI can save us from writing boilerplate code in TVM. -""" - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import tvm -import tvm.testing -from tvm import te -from tvm import topi -import numpy as np - -###################################################################### -# Basic example -# ------------- -# Let's revisit the sum of rows operation (equivalent to :code:`B = numpy.sum(A, axis=1)`') \ -# To compute the sum of rows of a two dimensional TVM tensor A, we should -# specify the symbolic operation as well as schedule as follows -# -n = te.var("n") -m = te.var("m") -A = te.placeholder((n, m), name="A") -k = te.reduce_axis((0, m), "k") -B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B") -s = te.create_schedule(B.op) - -###################################################################### -# and to examine the IR code in human readable format, we can do -# -print(tvm.lower(s, [A], simple_mode=True)) - -###################################################################### -# However, for such a common operation we had to define the reduce axis ourselves as well as explicit computation with -# :code:`te.compute`. 
Imagine for more complicated operations how much details we need to provide. -# Fortunately, we can replace those two lines with simple :code:`topi.sum` much like :code:`numpy.sum` -# -C = topi.sum(A, axis=1) -ts = te.create_schedule(C.op) -print(tvm.lower(ts, [A], simple_mode=True)) - -###################################################################### -# Numpy-style operator overloading -# -------------------------------- -# We can add two tensors using :code:`topi.broadcast_add` that have correct (broadcastable with specific) shapes. -# Even shorter, TOPI provides operator overloading for such common operations. For example, -# -x, y = 100, 10 -a = te.placeholder((x, y, y), name="a") -b = te.placeholder((y, y), name="b") -c = a + b # same as topi.broadcast_add -d = a * b # same as topi.broadcast_mul - -###################################################################### -# Overloaded with the same syntax, TOPI handles broadcasting a primitive (`int`, `float`) to a tensor :code:`d - 3.14`. - -###################################################################### -# Generic schedules and fusing operations -# --------------------------------------- -# Up to now, we have seen an example of how TOPI can save us from writing explicit computations in lower level API. -# But it doesn't stop here. Still we did the scheduling as before. TOPI also provides higher level -# scheduling recipes depending on a given context. For example, for CUDA, -# we can schedule the following series of operations ending with :code:`topi.sum` using only -# :code:`topi.generic.schedule_reduce` -# -e = topi.elemwise_sum([c, d]) -f = e / 2.0 -g = topi.sum(f) -with tvm.target.cuda(): - sg = topi.cuda.schedule_reduce(g) - print(tvm.lower(sg, [a, b], simple_mode=True)) - -###################################################################### -# As you can see, scheduled stages of computation have been accumulated and we can examine them by -# -print(sg.stages) - -###################################################################### -# We can test the correctness by comparing with :code:`numpy` result as follows -# -func = tvm.build(sg, [a, b, g], "cuda") -dev = tvm.cuda(0) -a_np = np.random.uniform(size=(x, y, y)).astype(a.dtype) -b_np = np.random.uniform(size=(y, y)).astype(b.dtype) -g_np = np.sum(np.add(a_np + b_np, a_np * b_np) / 2.0) -a_nd = tvm.nd.array(a_np, dev) -b_nd = tvm.nd.array(b_np, dev) -g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), dev) -func(a_nd, b_nd, g_nd) -tvm.testing.assert_allclose(g_nd.numpy(), g_np, rtol=1e-5) - -###################################################################### -# TOPI also provides common neural nets operations such as _softmax_ with optimized schedule -# -tarray = te.placeholder((512, 512), name="tarray") -softmax_topi = topi.nn.softmax(tarray) -with tvm.target.Target("cuda"): - sst = topi.cuda.schedule_softmax(softmax_topi) - print(tvm.lower(sst, [tarray], simple_mode=True)) - -###################################################################### -# Fusing convolutions -# ------------------- -# We can fuse :code:`topi.nn.conv2d` and :code:`topi.nn.relu` together. -# -# .. note:: -# -# TOPI functions are all generic functions. They have different implementations -# for different backends to optimize for performance. -# For each backend, it is necessary to call them under a target scope for both -# compute declaration and schedule. TVM will choose the right function to call with -# the target information. 
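# The placeholders below use NCHW/OIHW layout: one 3-channel 224x224 image and
# ten 3x5x5 filters. The positional arguments to ``topi.cuda.conv2d_nchw`` are
# stride=1, padding=2 and dilation=1, so the 5x5 convolution keeps the 224x224
# spatial size.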
- -data = te.placeholder((1, 3, 224, 224)) -kernel = te.placeholder((10, 3, 5, 5)) - -with tvm.target.Target("cuda"): - conv = topi.cuda.conv2d_nchw(data, kernel, 1, 2, 1) - out = topi.nn.relu(conv) - sconv = topi.cuda.schedule_conv2d_nchw([out]) - print(tvm.lower(sconv, [data, kernel], simple_mode=True)) - -###################################################################### -# Summary -# ------- -# In this tutorial, we have seen -# -# - How to use TOPI API for common operations with numpy-style operators. -# - How TOPI facilitates generic schedules and operator fusion for a context, to generate optimized kernel codes. diff --git a/gallery/tutorial/introduction.py b/gallery/tutorial/introduction.py deleted file mode 100644 index 4b94b23cf944..000000000000 --- a/gallery/tutorial/introduction.py +++ /dev/null @@ -1,133 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Introduction -============ -**Authors**: -`Jocelyn Shiue `_, -`Chris Hoge `_, -`Lianmin Zheng `_ - -Apache TVM is an open source machine learning compiler framework for CPUs, -GPUs, and machine learning accelerators. It aims to enable machine learning -engineers to optimize and run computations efficiently on any hardware backend. -The purpose of this tutorial is to take a guided tour through all of the major -features of TVM by defining and demonstrating key concepts. A new user should -be able to work through the tutorial from start to finish and be able to -operate TVM for automatic model optimization, while having a basic -understanding of the TVM architecture and how it works. - -Contents --------- - -#. :doc:`Introduction ` -#. :doc:`Compiling and Optimizing a Model with the Command Line Interface ` -#. :doc:`Compiling and Optimizing a Model with the Python Interface ` -#. :doc:`Working with Operators Using Tensor Expression ` -#. :doc:`Optimizing Operators with Templates and AutoTVM ` -#. :doc:`Optimizing Operators with Template-free AutoScheduler ` -#. :doc:`Compiling Deep Learning Models for GPUs ` -""" - - -################################################################################ -# An Overview of TVM and Model Optimization -# ========================================= -# -# The diagram below illustrates the steps a machine model takes as it is -# transformed with the TVM optimizing compiler framework. -# -# .. image:: https://raw.githubusercontent.com/apache/tvm-site/main/images/tutorial/overview.png -# :width: 100% -# :alt: A High Level View of TVM -# -# 1. Import the model from a framework like *Tensorflow*, *PyTorch*, or *Onnx*. -# The importer layer is where TVM can ingest models from other frameworks, like -# Tensorflow, PyTorch, or ONNX. The level of support that TVM offers for each -# frontend varies as we are constantly improving the open source project. 
If -# you're having issues importing your model into TVM, you may want to try -# converting it to ONNX. -# -# 2. Translate to *Relay*, TVM's high-level model language. -# A model that has been imported into TVM is represented in Relay. Relay is a -# functional language and intermediate representation (IR) for neural networks. -# It has support for: -# -# - Traditional data flow-style representations -# - Functional-style scoping, let-binding which makes it a fully featured -# differentiable language -# - Ability to allow the user to mix the two programming styles -# -# Relay applies graph-level optimization passes to optimize the model. -# -# 3. Lower to *Tensor Expression* (TE) representation. Lowering is when a -# higher-level representation is transformed into a lower-level -# representation. After applying the high-level optimizations, Relay -# runs FuseOps pass to partition the model into many small subgraphs and lowers -# the subgraphs to TE representation. Tensor Expression (TE) is a -# domain-specific language for describing tensor computations. -# TE also provides several *schedule* primitives to specify low-level loop -# optimizations, such as tiling, vectorization, parallelization, -# unrolling, and fusion. -# To aid in the process of converting Relay representation into TE representation, -# TVM includes a Tensor Operator Inventory (TOPI) that has pre-defined -# templates of common tensor operators (e.g., conv2d, transpose). -# -# 4. Search for the best schedule using the auto-tuning module *AutoTVM* or *AutoScheduler*. -# A schedule specifies the low-level loop optimizations for an operator or -# subgraph defined in TE. Auto-tuning modules search for the best schedule -# and compare them with cost models and on-device measurements. -# There are two auto-tuning modules in TVM. -# -# - **AutoTVM**: A template-based auto-tuning module. It runs search algorithms -# to find the best values for the tunable knobs in a user-defined template. -# For common operators, their templates are already provided in TOPI. -# - **AutoScheduler (a.k.a. Ansor)**: A template-free auto-tuning module. -# It does not require pre-defined schedule templates. Instead, it generates -# the search space automatically by analyzing the computation definition. -# It then searches for the best schedule in the generated search space. -# -# 5. Choose the optimal configurations for model compilation. After tuning, the -# auto-tuning module generates tuning records in JSON format. This step -# picks the best schedule for each subgraph. -# -# 6. Lower to Tensor Intermediate Representation (TIR), TVM's low-level -# intermediate representation. After selecting the optimal configurations -# based on the tuning step, each TE subgraph is lowered to TIR and be -# optimized by low-level optimization passes. Next, the optimized TIR is -# lowered to the target compiler of the hardware platform. -# This is the final code generation phase to produce an optimized model -# that can be deployed into production. TVM supports several different -# compiler backends including: -# -# - LLVM, which can target arbitrary microprocessor architecture including -# standard x86 and ARM processors, AMDGPU and NVPTX code generation, and any -# other platform supported by LLVM. -# - Specialized compilers, such as NVCC, NVIDIA's compiler. -# - Embedded and specialized targets, which are implemented through TVM's -# Bring Your Own Codegen (BYOC) framework. -# -# 7. Compile down to machine code. 
At the end of this process, the -# compiler-specific generated code can be lowered to machine code. -# -# TVM can compile models down to a linkable object module, which can then be -# run with a lightweight TVM runtime that provides C APIs to dynamically -# load the model, and entry points for other languages such as Python and -# Rust. TVM can also build a bundled deployment in which the runtime is -# combined with the model in a single package. -# -# The remainder of the tutorial will cover these aspects of TVM in more detail. diff --git a/gallery/tutorial/relay_quick_start.py b/gallery/tutorial/relay_quick_start.py deleted file mode 100644 index 0cbe35b3e075..000000000000 --- a/gallery/tutorial/relay_quick_start.py +++ /dev/null @@ -1,158 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-relay-quick-start: - -Quick Start Tutorial for Compiling Deep Learning Models -======================================================= -**Author**: `Yao Wang `_, `Truman Tian `_ - -This example shows how to build a neural network with Relay python frontend and -generates a runtime library for Nvidia GPU with TVM. -Notice that you need to build TVM with cuda and llvm enabled. -""" - -###################################################################### -# Overview for Supported Hardware Backend of TVM -# ---------------------------------------------- -# The image below shows hardware backend currently supported by TVM: -# -# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/tvm_support_list.png -# :align: center -# -# In this tutorial, we'll choose cuda and llvm as target backends. -# To begin with, let's import Relay and TVM. - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import numpy as np - -from tvm import relay -from tvm.relay import testing -import tvm -from tvm import te -from tvm.contrib import graph_executor -import tvm.testing - -###################################################################### -# Define Neural Network in Relay -# ------------------------------ -# First, let's define a neural network with relay python frontend. -# For simplicity, we'll use pre-defined resnet-18 network in Relay. -# Parameters are initialized with Xavier initializer. -# Relay also supports other model formats such as MXNet, CoreML, ONNX and -# Tensorflow. -# -# In this tutorial, we assume we will do inference on our device and -# the batch size is set to be 1. Input images are RGB color images of -# size 224 * 224. We can call the -# :py:meth:`tvm.relay.expr.TupleWrapper.astext()` to show the network -# structure. 
- -batch_size = 1 -num_class = 1000 -image_shape = (3, 224, 224) -data_shape = (batch_size,) + image_shape -out_shape = (batch_size, num_class) - -mod, params = relay.testing.resnet.get_workload( - num_layers=18, batch_size=batch_size, image_shape=image_shape -) - -# set show_meta_data=True if you want to show meta data -print(mod.astext(show_meta_data=False)) - -###################################################################### -# Compilation -# ----------- -# Next step is to compile the model using the Relay/TVM pipeline. -# Users can specify the optimization level of the compilation. -# Currently this value can be 0 to 3. The optimization passes include -# operator fusion, pre-computation, layout transformation and so on. -# -# :py:func:`relay.build` returns three components: the execution graph in -# json format, the TVM module library of compiled functions specifically -# for this graph on the target hardware, and the parameter blobs of -# the model. During the compilation, Relay does the graph-level -# optimization while TVM does the tensor-level optimization, resulting -# in an optimized runtime module for model serving. -# -# We'll first compile for Nvidia GPU. Behind the scene, :py:func:`relay.build` -# first does a number of graph-level optimizations, e.g. pruning, fusing, etc., -# then registers the operators (i.e. the nodes of the optimized graphs) to -# TVM implementations to generate a `tvm.module`. -# To generate the module library, TVM will first transfer the high level IR -# into the lower intrinsic IR of the specified target backend, which is CUDA -# in this example. Then the machine code will be generated as the module library. - -opt_level = 3 -target = tvm.target.cuda() -with tvm.transform.PassContext(opt_level=opt_level): - lib = relay.build(mod, target, params=params) - -##################################################################### -# Run the generate library -# ------------------------ -# Now we can create graph executor and run the module on Nvidia GPU. - -# create random input -dev = tvm.cuda() -data = np.random.uniform(-1, 1, size=data_shape).astype("float32") -# create module -module = graph_executor.GraphModule(lib["default"](dev)) -# set input and parameters -module.set_input("data", data) -# run -module.run() -# get output -out = module.get_output(0, tvm.nd.empty(out_shape)).numpy() - -# Print first 10 elements of output -print(out.flatten()[0:10]) - -###################################################################### -# Save and Load Compiled Module -# ----------------------------- -# We can also save the graph, lib and parameters into files and load them -# back in deploy environment. - -#################################################### - -# save the graph, lib and params into separate files -from tvm.contrib import utils - -temp = utils.tempdir() -path_lib = temp.relpath("deploy_lib.tar") -lib.export_library(path_lib) -print(temp.listdir()) - -#################################################### - -# load the module back. 
-loaded_lib = tvm.runtime.load_module(path_lib) -input_data = tvm.nd.array(data) - -module = graph_executor.GraphModule(loaded_lib["default"](dev)) -module.run(data=input_data) -out_deploy = module.get_output(0).numpy() - -# Print first 10 elements of output -print(out_deploy.flatten()[0:10]) - -# check whether the output from deployed module is consistent with original one -tvm.testing.assert_allclose(out_deploy, out, atol=1e-5) diff --git a/gallery/tutorial/tensor_expr_get_started.py b/gallery/tutorial/tensor_expr_get_started.py deleted file mode 100644 index ba7e0c027023..000000000000 --- a/gallery/tutorial/tensor_expr_get_started.py +++ /dev/null @@ -1,905 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-tensor-expr-get-started: - -Working with Operators Using Tensor Expression -============================================== -**Author**: `Tianqi Chen `_ - -In this tutorial we will turn our attention to how TVM works with Tensor -Expression (TE) to define tensor computations and apply loop optimizations. TE -describes tensor computations in a pure functional language (that is each -expression has no side effects). When viewed in context of the TVM as a whole, -Relay describes a computation as a set of operators, and each of these -operators can be represented as a TE expression where each TE expression takes -input tensors and produces an output tensor. - -This is an introductory tutorial to the Tensor Expression language in TVM. TVM -uses a domain specific tensor expression for efficient kernel construction. We -will demonstrate the basic workflow with two examples of using the tensor expression -language. The first example introduces TE and scheduling with vector -addition. The second expands on these concepts with a step-by-step optimization -of a matrix multiplication with TE. This matrix multiplication example will -serve as the comparative basis for future tutorials covering more advanced -features of TVM. -""" - - -################################################################################ -# Example 1: Writing and Scheduling Vector Addition in TE for CPU -# --------------------------------------------------------------- -# -# Let's look at an example in Python in which we will implement a TE for -# vector addition, followed by a schedule targeted towards a CPU. -# We begin by initializing a TVM environment. - -import tvm -import tvm.testing -from tvm import te -import numpy as np - -################################################################################ -# You will get better performance if you can identify the CPU you are targeting -# and specify it. 
If you're using LLVM, you can get this information from the -# command ``llc --version`` to get the CPU type, and you can check -# ``/proc/cpuinfo`` for additional extensions that your processor might -# support. For example, you can use ``llvm -mcpu=skylake-avx512`` for CPUs with -# AVX-512 instructions. - -tgt = tvm.target.Target(target="llvm", host="llvm") - -################################################################################ -# Describing the Vector Computation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We describe a vector addition computation. TVM adopts tensor semantics, with -# each intermediate result represented as a multi-dimensional array. The user -# needs to describe the computation rule that generates the tensors. We first -# define a symbolic variable ``n`` to represent the shape. We then define two -# placeholder Tensors, ``A`` and ``B``, with given shape ``(n,)``. We then -# describe the result tensor ``C``, with a ``compute`` operation. The -# ``compute`` defines a computation, with the output conforming to the -# specified tensor shape and the computation to be performed at each position -# in the tensor defined by the lambda function. Note that while ``n`` is a -# variable, it defines a consistent shape between the ``A``, ``B`` and ``C`` -# tensors. Remember, no actual computation happens during this phase, as we -# are only declaring how the computation should be done. - -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.placeholder((n,), name="B") -C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - -################################################################################ -# .. admonition:: Lambda Functions -# -# The second argument to the ``te.compute`` method is the function that -# performs the computation. In this example, we're using an anonymous function, -# also known as a ``lambda`` function, to define the computation, in this case -# addition on the ``i``\th element of ``A`` and ``B``. - -################################################################################ -# Create a Default Schedule for the Computation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# While the above lines describe the computation rule, we can compute ``C`` in -# many different ways to fit different devices. For a tensor with multiple -# axes, you can choose which axis to iterate over first, or computations can be -# split across different threads. TVM requires that the user to provide a -# schedule, which is a description of how the computation should be performed. -# Scheduling operations within TE can change loop orders, split computations -# across different threads, and group blocks of data together, amongst other -# operations. An important concept behind schedules is that they only describe -# how the computation is performed, so different schedules for the same TE will -# produce the same result. -# -# TVM allows you to create a naive schedule that will compute ``C`` in by -# iterating in row major order. -# -# .. code-block:: c -# -# for (int i = 0; i < n; ++i) { -# C[i] = A[i] + B[i]; -# } - -s = te.create_schedule(C.op) - -###################################################################### -# Compile and Evaluate the Default Schedule -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# With the TE expression and a schedule, we can produce runnable code for our -# target language and architecture, in this case LLVM and a CPU. 
We provide -# TVM with the schedule, a list of the TE expressions that are in the schedule, -# the target and host, and the name of the function we are producing. The result -# of the output is a type-erased function that can be called directly from Python. -# -# In the following line, we use ``tvm.build`` to create a function. The build -# function takes the schedule, the desired signature of the function (including -# the inputs and outputs) as well as target language we want to compile to. - -fadd = tvm.build(s, [A, B, C], tgt, name="myadd") - -################################################################################ -# Let's run the function, and compare the output to the same computation in -# numpy. The compiled TVM function exposes a concise C API that can be invoked -# from any language. We begin by creating a device, which is a device (CPU in this -# example) that TVM can compile the schedule to. In this case the device is an -# LLVM CPU target. We can then initialize the tensors in our device and -# perform the custom addition operation. To verify that the computation is -# correct, we can compare the result of the output of the c tensor to the same -# computation performed by numpy. - -dev = tvm.device(tgt.kind.name, 0) - -n = 1024 -a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) -b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) -c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) -fadd(a, b, c) -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# To get a comparison of how fast this version is compared to numpy, create a -# helper function to run a profile of the TVM generated code. -import timeit - -np_repeat = 100 -np_running_time = timeit.timeit( - setup="import numpy\n" - "n = 32768\n" - 'dtype = "float32"\n' - "a = numpy.random.rand(n, 1).astype(dtype)\n" - "b = numpy.random.rand(n, 1).astype(dtype)\n", - stmt="answer = a + b", - number=np_repeat, -) -print("Numpy running time: %f" % (np_running_time / np_repeat)) - - -def evaluate_addition(func, target, optimization, log): - dev = tvm.device(target.kind.name, 0) - n = 32768 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - - evaluator = func.time_evaluator(func.entry_name, dev, number=10) - mean_time = evaluator(a, b, c).mean - print("%s: %f" % (optimization, mean_time)) - - log.append((optimization, mean_time)) - - -log = [("numpy", np_running_time / np_repeat)] -evaluate_addition(fadd, tgt, "naive", log=log) - -################################################################################ -# Updating the Schedule to Use Parallelism -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Now that we've illustrated the fundamentals of TE, let's go deeper into what -# schedules do, and how they can be used to optimize tensor expressions for -# different architectures. A schedule is a series of steps that are applied to -# an expression to transform it in a number of different ways. When a schedule -# is applied to an expression in TE, the inputs and outputs remain the same, -# but when compiled the implementation of the expression can change. This -# tensor addition, in the default schedule, is run serially but is easy to -# parallelize across all of the processor threads. We can apply the parallel -# schedule operation to our computation. 
- -s[C].parallel(C.op.axis[0]) - -################################################################################ -# The ``tvm.lower`` command will generate the Intermediate Representation (IR) -# of the TE, with the corresponding schedule. By lowering the expression as we -# apply different schedule operations, we can see the effect of scheduling on -# the ordering of the computation. We use the flag ``simple_mode=True`` to -# return a readable C-style statement. - -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# It's now possible for TVM to run these blocks on independent threads. Let's -# compile and run this new schedule with the parallel operation applied: - -fadd_parallel = tvm.build(s, [A, B, C], tgt, name="myadd_parallel") -fadd_parallel(a, b, c) - -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -evaluate_addition(fadd_parallel, tgt, "parallel", log=log) - -################################################################################ -# Updating the Schedule to Use Vectorization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Modern CPUs also have the ability to perform SIMD operations on floating -# point values, and we can apply another schedule to our computation expression -# to take advantage of this. Accomplishing this requires multiple steps: first -# we have to split the schedule into inner and outer loops using the split -# scheduling primitive. The inner loops can use vectorization to use SIMD -# instructions using the vectorize scheduling primitive, then the outer loops -# can be parallelized using the parallel scheduling primitive. Choose the split -# factor to be the number of threads on your CPU. - -# Recreate the schedule, since we modified it with the parallel operation in -# the previous example -n = te.var("n") -A = te.placeholder((n,), name="A") -B = te.placeholder((n,), name="B") -C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - -s = te.create_schedule(C.op) - -# This factor should be chosen to match the number of threads appropriate for -# your CPU. This will vary depending on architecture, but a good rule is -# setting this factor to equal the number of available CPU cores. -factor = 4 - -outer, inner = s[C].split(C.op.axis[0], factor=factor) -s[C].parallel(outer) -s[C].vectorize(inner) - -fadd_vector = tvm.build(s, [A, B, C], tgt, name="myadd_parallel") - -evaluate_addition(fadd_vector, tgt, "vector", log=log) - -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Comparing the Different Schedules -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# We can now compare the different schedules - -baseline = log[0][1] -print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20))) -for result in log: - print( - "%s\t%s\t%s" - % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20)) - ) - - -################################################################################ -# .. admonition:: Code Specialization -# -# As you may have noticed, the declarations of ``A``, ``B`` and ``C`` all -# take the same shape argument, ``n``. TVM will take advantage of this to -# pass only a single shape argument to the kernel, as you will find in the -# printed device code. This is one form of specialization. -# -# On the host side, TVM will automatically generate check code that checks -# the constraints in the parameters. 
So if you pass arrays with different -# shapes into fadd, an error will be raised. -# -# We can do more specializations. For example, we can write :code:`n = -# tvm.runtime.convert(1024)` instead of :code:`n = te.var("n")`, in the -# computation declaration. The generated function will only take vectors with -# length 1024. - -################################################################################ -# We've defined, scheduled, and compiled a vector addition operator, which we -# were then able to execute on the TVM runtime. We can save the operator as a -# library, which we can then load later using the TVM runtime. - -################################################################################ -# Targeting Vector Addition for GPUs (Optional) -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# TVM is capable of targeting multiple architectures. In the next example, we -# will target compilation of the vector addition to GPUs. - -# If you want to run this code, change ``run_cuda = True`` -# Note that by default this example is not run in the docs CI. - -run_cuda = False -if run_cuda: - # Change this target to the correct backend for you gpu. For example: cuda (NVIDIA GPUs), - # rocm (Radeon GPUS), OpenCL (opencl). - tgt_gpu = tvm.target.Target(target="cuda", host="llvm") - - # Recreate the schedule - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - print(type(C)) - - s = te.create_schedule(C.op) - - bx, tx = s[C].split(C.op.axis[0], factor=64) - - ################################################################################ - # Finally we must bind the iteration axis bx and tx to threads in the GPU - # compute grid. The naive schedule is not valid for GPUs, and these are - # specific constructs that allow us to generate code that runs on a GPU. - - s[C].bind(bx, te.thread_axis("blockIdx.x")) - s[C].bind(tx, te.thread_axis("threadIdx.x")) - - ###################################################################### - # Compilation - # ----------- - # After we have finished specifying the schedule, we can compile it - # into a TVM function. By default TVM compiles into a type-erased - # function that can be directly called from the python side. - # - # In the following line, we use tvm.build to create a function. - # The build function takes the schedule, the desired signature of the - # function (including the inputs and outputs) as well as target language - # we want to compile to. - # - # The result of compilation fadd is a GPU device function (if GPU is - # involved) as well as a host wrapper that calls into the GPU - # function. fadd is the generated host wrapper function, it contains - # a reference to the generated device function internally. - - fadd = tvm.build(s, [A, B, C], target=tgt_gpu, name="myadd") - - ################################################################################ - # The compiled TVM function exposes a concise C API that can be invoked from - # any language. - # - # We provide a minimal array API in python to aid quick testing and prototyping. - # The array API is based on the `DLPack `_ standard. - # - # - We first create a GPU device. - # - Then tvm.nd.array copies the data to the GPU. - # - ``fadd`` runs the actual computation - # - ``numpy()`` copies the GPU array back to the CPU (so we can verify correctness). - # - # Note that copying the data to and from the memory on the GPU is a required step. 
- - dev = tvm.device(tgt_gpu.kind.name, 0) - - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - ################################################################################ - # Inspect the Generated GPU Code - # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # You can inspect the generated code in TVM. The result of tvm.build is a TVM - # Module. fadd is the host module that contains the host wrapper, it also - # contains a device module for the CUDA (GPU) function. - # - # The following code fetches the device module and prints the content code. - - if ( - tgt_gpu.kind.name == "cuda" - or tgt_gpu.kind.name == "rocm" - or tgt_gpu.kind.name.startswith("opencl") - ): - dev_module = fadd.imported_modules[0] - print("-----GPU code-----") - print(dev_module.get_source()) - else: - print(fadd.get_source()) - -################################################################################ -# Saving and Loading Compiled Modules -# ----------------------------------- -# Besides runtime compilation, we can save the compiled modules into a file and -# load them back later. -# -# The following code first performs the following steps: -# -# - It saves the compiled host module into an object file. -# - Then it saves the device module into a ptx file. -# - cc.create_shared calls a compiler (gcc) to create a shared library - -from tvm.contrib import cc -from tvm.contrib import utils - -temp = utils.tempdir() -fadd.save(temp.relpath("myadd.o")) -if tgt.kind.name == "cuda": - fadd.imported_modules[0].save(temp.relpath("myadd.ptx")) -if tgt.kind.name == "rocm": - fadd.imported_modules[0].save(temp.relpath("myadd.hsaco")) -if tgt.kind.name.startswith("opencl"): - fadd.imported_modules[0].save(temp.relpath("myadd.cl")) -cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")]) -print(temp.listdir()) - -################################################################################ -# .. admonition:: Module Storage Format -# -# The CPU (host) module is directly saved as a shared library (.so). There -# can be multiple customized formats of the device code. In our example, the -# device code is stored in ptx, as well as a meta data json file. They can be -# loaded and linked separately via import. - -################################################################################ -# Load Compiled Module -# ~~~~~~~~~~~~~~~~~~~~ -# We can load the compiled module from the file system and run the code. The -# following code loads the host and device module separately and links them -# together. We can verify that the newly loaded function works. 
- -fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so")) -if tgt.kind.name == "cuda": - fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.ptx")) - fadd1.import_module(fadd1_dev) - -if tgt.kind.name == "rocm": - fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco")) - fadd1.import_module(fadd1_dev) - -if tgt.kind.name.startswith("opencl"): - fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl")) - fadd1.import_module(fadd1_dev) - -fadd1(a, b, c) -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# Pack Everything into One Library -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# In the above example, we store the device and host code separately. TVM also -# supports export everything as one shared library. Under the hood, we pack -# the device modules into binary blobs and link them together with the host -# code. Currently we support packing of Metal, OpenCL and CUDA modules. - -fadd.export_library(temp.relpath("myadd_pack.so")) -fadd2 = tvm.runtime.load_module(temp.relpath("myadd_pack.so")) -fadd2(a, b, c) -tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# .. admonition:: Runtime API and Thread-Safety -# -# The compiled modules of TVM do not depend on the TVM compiler. Instead, -# they only depend on a minimum runtime library. The TVM runtime library -# wraps the device drivers and provides thread-safe and device agnostic calls -# into the compiled functions. -# -# This means that you can call the compiled TVM functions from any thread, on -# any GPUs, provided that you have compiled the code for that GPU. - -################################################################################ -# Generate OpenCL Code -# -------------------- -# TVM provides code generation features into multiple backends. We can also -# generate OpenCL code or LLVM code that runs on CPU backends. -# -# The following code blocks generate OpenCL code, creates array on an OpenCL -# device, and verifies the correctness of the code. - -if tgt.kind.name.startswith("opencl"): - fadd_cl = tvm.build(s, [A, B, C], tgt, name="myadd") - print("------opencl code------") - print(fadd_cl.imported_modules[0].get_source()) - dev = tvm.cl(0) - n = 1024 - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) - fadd_cl(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - -################################################################################ -# .. admonition:: TE Scheduling Primitives -# -# TVM includes a number of different scheduling primitives: -# -# - split: splits a specified axis into two axises by the defined factor. -# - tile: tiles will split a computation across two axes by the defined factors. -# - fuse: fuses two consecutive axises of one computation. -# - reorder: can reorder the axises of a computation into a defined order. -# - bind: can bind a computation to a specific thread, useful in GPU programming. -# - compute_at: by default, TVM will compute tensors at the outermost level -# of the function, or the root, by default. compute_at specifies that one -# tensor should be computed at the first axis of computation for another -# operator. 
-# - compute_inline: when marked inline, a computation will be expanded then -# inserted into the address where the tensor is required. -# - compute_root: moves a computation to the outermost layer, or root, of the -# function. This means that stage of the computation will be fully computed -# before it moves on to the next stage. -# -# A complete description of these primitives can be found in the -# :ref:`Schedule Primitives ` docs page. - -################################################################################ -# Example 2: Manually Optimizing Matrix Multiplication with TE -# ------------------------------------------------------------ -# -# Now we will consider a second, more advanced example, demonstrating how with -# just 18 lines of python code TVM speeds up a common matrix multiplication operation by 18x. -# -# **Matrix multiplication is a compute intensive operation. There are -# two important optimizations for good CPU performance:** -# -# 1. Increase the cache hit rate of memory access. Both complex -# numerical computation and hot-spot memory access can be -# accelerated by a high cache hit rate. This requires us to -# transform the origin memory access pattern to a pattern that fits -# the cache policy. -# -# 2. SIMD (Single instruction multi-data), also known as the vector -# processing unit. On each cycle instead of processing a single -# value, SIMD can process a small batch of data. This requires us -# to transform the data access pattern in the loop body in uniform -# pattern so that the LLVM backend can lower it to SIMD. -# -# The techniques used in this tutorial are a subset of tricks mentioned in this -# `repository `_. Some of them -# have been applied by TVM abstraction automatically, but some of them cannot -# be automatically applied due to TVM constraints. - -################################################################################ -# Preparation and Performance Baseline -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# We begin by collecting performance data on the `numpy` implementation of -# matrix multiplication. - -import tvm -import tvm.testing -from tvm import te -import numpy - -# The size of the matrix -# (M, K) x (K, N) -# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL. -M = 1024 -K = 1024 -N = 1024 - -# The default tensor data type in tvm -dtype = "float32" - -# You will want to adjust the target to match any CPU vector extensions you -# might have. For example, if you're using using Intel AVX2 (Advanced Vector -# Extensions) ISA for SIMD, you can get the best performance by changing the -# following line to ``llvm -mcpu=core-avx2``, or specific type of CPU you use. -# Recall that you're using llvm, you can get this information from the command -# ``llc --version`` to get the CPU type, and you can check ``/proc/cpuinfo`` -# for additional extensions that your processor might support. 
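As a concrete illustration of the note above, the following is a minimal sketch of pinning the target to a specific CPU feature set. It assumes an x86 machine with AVX2 support; the exact ``-mcpu`` value should come from what ``llc --version`` and ``/proc/cpuinfo`` report for your processor.

.. code-block:: python

    import tvm

    # Hypothetical target for an x86 CPU with AVX2; replace "core-avx2" with
    # the CPU name reported by ``llc --version`` on your machine.
    avx2_target = tvm.target.Target(target="llvm -mcpu=core-avx2", host="llvm")
    print(avx2_target)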
- -target = tvm.target.Target(target="llvm", host="llvm") -dev = tvm.device(target.kind.name, 0) - -# Random generated tensor for testing -a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), dev) -b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), dev) - -# Repeatedly perform a matrix multiplication to get a performance baseline -# for the default numpy implementation -np_repeat = 100 -np_running_time = timeit.timeit( - setup="import numpy\n" - "M = " + str(M) + "\n" - "K = " + str(K) + "\n" - "N = " + str(N) + "\n" - 'dtype = "float32"\n' - "a = numpy.random.rand(M, K).astype(dtype)\n" - "b = numpy.random.rand(K, N).astype(dtype)\n", - stmt="answer = numpy.dot(a, b)", - number=np_repeat, -) -print("Numpy running time: %f" % (np_running_time / np_repeat)) - -answer = numpy.dot(a.numpy(), b.numpy()) - -################################################################################ -# Now we write a basic matrix multiplication using TVM TE and verify that it -# produces the same results as the numpy implementation. We also write a -# function that will help us measure the performance of the schedule -# optimizations. - -# TVM Matrix Multiplication using TE -k = te.reduce_axis((0, K), "k") -A = te.placeholder((M, K), name="A") -B = te.placeholder((K, N), name="B") -C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C") - -# Default schedule -s = te.create_schedule(C.op) -func = tvm.build(s, [A, B, C], target=target, name="mmult") - -c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) -func(a, b, c) -tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5) - - -def evaluate_operation(s, vars, target, name, optimization, log): - func = tvm.build(s, [A, B, C], target=target, name="mmult") - assert func - - c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev) - func(a, b, c) - tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5) - - evaluator = func.time_evaluator(func.entry_name, dev, number=10) - mean_time = evaluator(a, b, c).mean - print("%s: %f" % (optimization, mean_time)) - log.append((optimization, mean_time)) - - -log = [] - -evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="none", log=log) - -################################################################################ -# Let's take a look at the intermediate representation of the operator and -# default schedule using the TVM lower function. Note how the implementation is -# essentially a naive implementation of a matrix multiplication, using three -# nested loops over the indices of the A and B matrices. - -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Optimization 1: Blocking -# ~~~~~~~~~~~~~~~~~~~~~~~~ -# -# A important trick to enhance the cache hit rate is blocking, where you -# structure memory access such that the inside a block is a small neighborhood -# that has high memory locality. In this tutorial, we pick a block factor of -# 32. This will result in a block that will fill a 32 * 32 * sizeof(float) area -# of memory. This corresponds to a cache size of 4KB, in relation to a -# reference cache size of 32 KB for L1 cache. -# -# We begin by creating a default schedule for the ``C`` operation, then apply a -# ``tile`` scheduling primitive to it with the specified block factor, with the -# scheduling primitive returning the resulting loop order from outermost to -# innermost, as a vector ``[x_outer, y_outer, x_inner, y_inner]``. 
We then get -# the reduction axis for output of the operation, and perform a split operation -# on it using a factor of 4. This factor doesn't directly impact the blocking -# optimization we're working on right now, but will be useful later when we -# apply vectorization. -# -# Now that the operation has been blocked, we can reorder the computation to -# put the reduction operation into the outermost loop of the computation, -# helping to guarantee that the blocked data remains in cache. This completes -# the schedule, and we can build and test the performance compared to the naive -# schedule. - -bn = 32 - -# Blocking by loop tiling -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -(k,) = s[C].op.reduce_axis -ko, ki = s[C].split(k, factor=4) - -# Hoist reduction domain outside the blocking loop -s[C].reorder(xo, yo, ko, ki, xi, yi) - -evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="blocking", log=log) - -################################################################################ -# By reordering the computation to take advantage of caching, you should see a -# significant improvement in the performance of the computation. Now, print the -# internal representation and compare it to the original: - -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Optimization 2: Vectorization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Another important optimization trick is vectorization. When the memory access -# pattern is uniform, the compiler can detect this pattern and pass the -# continuous memory to the SIMD vector processor. In TVM, we can use the -# ``vectorize`` interface to hint the compiler this pattern, taking advantage -# of this hardware feature. -# -# In this tutorial, we chose to vectorize the inner loop row data since it is -# already cache friendly from our previous optimizations. - -# Apply the vectorization optimization -s[C].vectorize(yi) - -evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="vectorization", log=log) - -# The generalized IR after vectorization -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Optimization 3: Loop Permutation -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# If we look at the above IR, we can see the inner loop row data is vectorized -# and B is transformed into PackedB (this is evident by the `(float32x32*)B2` -# portion of the inner loop). The traversal of PackedB is sequential now. So we -# will look at the access pattern of A. In current schedule, A is accessed -# column by column which is not cache friendly. If we change the nested loop -# order of `ki` and inner axes `xi`, the access pattern for A matrix will be -# more cache friendly. - -s = te.create_schedule(C.op) -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -(k,) = s[C].op.reduce_axis -ko, ki = s[C].split(k, factor=4) - -# re-ordering -s[C].reorder(xo, yo, ko, xi, ki, yi) -s[C].vectorize(yi) - -evaluate_operation( - s, [A, B, C], target=target, name="mmult", optimization="loop permutation", log=log -) - -# Again, print the new generalized IR -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Optimization 4: Array Packing -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Another important trick is array packing. 
This trick is to reorder the -# storage dimension of the array to convert the continuous access pattern on -# certain dimension to a sequential pattern after flattening. -# -# .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png -# :align: center -# -# Just as it is shown in the figure above, after blocking the computations, we -# can observe the array access pattern of B (after flattening), which is -# regular but discontinuous. We expect that after some transformation we can -# get a continuous access pattern. By reordering a ``[16][16]`` array to a -# ``[16/4][16][4]`` array the access pattern of B will be sequential when -# grabbing the corresponding value from the packed array. -# -# To accomplish this, we are going to have to start with a new default -# schedule, taking into account the new packing of B. It's worth taking a -# moment to comment on this: TE is a powerful and expressive language for -# writing optimized operators, but it often requires some knowledge of the -# underlying algorithm, data structures, and hardware target that you are -# writing for. Later in the tutorial, we will discuss some of the options for -# letting TVM take that burden. Regardless, let's move on with the new -# optimized schedule. - -# We have to re-write the algorithm slightly. -packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB") -C = te.compute( - (M, N), - lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k), - name="C", -) - -s = te.create_schedule(C.op) - -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -(k,) = s[C].op.reduce_axis -ko, ki = s[C].split(k, factor=4) - -s[C].reorder(xo, yo, ko, xi, ki, yi) -s[C].vectorize(yi) - -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) - -evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="array packing", log=log) - -# Here is the generated IR after array packing. -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Optimization 5: Optimizing Block Writing Through Caching -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Up to this point all of our optimizations have focused on efficiently -# accessing and computing the data from the `A` and `B` matrices to compute the -# `C` matrix. After the blocking optimization, the operator will write result -# to `C` block by block, and the access pattern is not sequential. We can -# address this by using a sequential cache array, using a combination of -# `cache_write`, `compute_at`, and `unroll`to hold the block results and write -# to `C` when all the block results are ready. - -s = te.create_schedule(C.op) - -# Allocate write cache -CC = s.cache_write(C, "global") - -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) - -# Write cache is computed at yo -s[CC].compute_at(s[C], yo) - -# New inner axes -xc, yc = s[CC].op.axis - -(k,) = s[CC].op.reduce_axis -ko, ki = s[CC].split(k, factor=4) -s[CC].reorder(ko, xc, ki, yc) -s[CC].unroll(ki) -s[CC].vectorize(yc) - -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) - -evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="block caching", log=log) - -# Here is the generated IR after write cache blocking. 
-print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Optimization 6: Parallelization -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# So far, our computation is only designed to use a single core. Nearly all -# modern processors have multiple cores, and computation can benefit from -# running computations in parallel. The final optimization is to take advantage -# of thread-level parallelization. - -# parallel -s[C].parallel(xo) - -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) - -evaluate_operation( - s, [A, B, C], target=target, name="mmult", optimization="parallelization", log=log -) - -# Here is the generated IR after parallelization. -print(tvm.lower(s, [A, B, C], simple_mode=True)) - -################################################################################ -# Summary of Matrix Multiplication Example -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# After applying the above simple optimizations with only 18 lines of code, our -# generated code can begin to approach the performance of `numpy` with the Math -# Kernel Library (MKL). Since we've been logging the performance as we've been -# working, we can compare the results. - -baseline = log[0][1] -print("%s\t%s\t%s" % ("Operator".rjust(20), "Timing".rjust(20), "Performance".rjust(20))) -for result in log: - print( - "%s\t%s\t%s" - % (result[0].rjust(20), str(result[1]).rjust(20), str(result[1] / baseline).rjust(20)) - ) - -################################################################################ -# Note that the outputs on the web page reflect the running times on a -# non-exclusive Docker container, and should be considered unreliable. It is -# highly encouraged to run the tutorial by yourself to observe the performance -# gain achieved by TVM, and to carefully work through each example to -# understand the iterative improvements that are made to the matrix -# multiplication operation. - -################################################################################ -# Final Notes and Summary -# ----------------------- -# As mentioned earlier, how to apply optimizations using TE and scheduling -# primitives can require some knowledge of the underlying architecture and -# algorithms. However, TE was designed to act as a foundation for more complex -# algorithms that can search the potential optimization. With the knowledge you -# have from this introduction to TE, we can now begin to explore how TVM can -# automate the schedule optimization process. -# -# This tutorial provided a walk-through of TVM Tensor Expression (TE) workflow -# using a vector add and a matrix multiplication examples. The general workflow -# is -# -# - Describe your computation via a series of operations. -# - Describe how we want to compute use schedule primitives. -# - Compile to the target function we want. -# - Optionally, save the function to be loaded later. -# -# Upcoming tutorials expand on the matrix multiplication example, and show how -# you can build generic templates of the matrix multiplication and other -# operations with tunable parameters that allows you to automatically optimize -# the computation for specific platforms. 
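Before moving on, here is a condensed sketch of the four-step workflow summarized above, reusing the vector-add definition and schedule primitives from this tutorial. It is only a sketch: it assumes the legacy ``te.create_schedule`` API used throughout this file and a plain ``llvm`` target on the local CPU.

.. code-block:: python

    import numpy as np
    import tvm
    from tvm import te
    from tvm.contrib import utils

    # 1. Describe the computation.
    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")

    # 2. Describe how to compute it, using schedule primitives.
    s = te.create_schedule(C.op)
    outer, inner = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(outer)
    s[C].vectorize(inner)

    # 3. Compile to the target function we want.
    tgt = tvm.target.Target(target="llvm", host="llvm")
    fadd = tvm.build(s, [A, B, C], tgt, name="myadd")

    # 4. Optionally, save the function and load it back later.
    temp = utils.tempdir()
    fadd.export_library(temp.relpath("myadd.so"))
    fadd_loaded = tvm.runtime.load_module(temp.relpath("myadd.so"))

    # Sanity-check the loaded function against numpy.
    dev = tvm.device(tgt.kind.name, 0)
    a = tvm.nd.array(np.random.uniform(size=1024).astype("float32"), dev)
    b = tvm.nd.array(np.random.uniform(size=1024).astype("float32"), dev)
    c = tvm.nd.array(np.zeros(1024, dtype="float32"), dev)
    fadd_loaded(a, b, c)
    np.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())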
diff --git a/gallery/tutorial/tensor_ir_blitz_course.py b/gallery/tutorial/tensor_ir_blitz_course.py deleted file mode 100644 index 346dc6154f9b..000000000000 --- a/gallery/tutorial/tensor_ir_blitz_course.py +++ /dev/null @@ -1,194 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tir_blitz: - -Blitz Course to TensorIR -======================== -**Author**: `Siyuan Feng `_ - -TensorIR is a domain specific language for deep learning programs serving two broad purposes: - -- An implementation for transforming and optimizing programs on various hardware backends. - -- An abstraction for automatic _tensorized_ program optimization. - -""" - -# sphinx_gallery_start_ignore -# sphinx_gallery_requires_cuda = True -# sphinx_gallery_end_ignore -import tvm -from tvm.ir.module import IRModule -from tvm.script import tir as T -import numpy as np - -################################################################################################ -# IRModule -# -------- -# An IRModule is the central data structure in TVM, which contains deep learning programs. -# It is the basic object of interest of IR transformation and model building. -# -# .. image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_life_of_irmodule.png -# :align: center -# :width: 85% -# -# This is the life cycle of an IRModule, which can be created from TVMScript. TensorIR schedule -# primitives and passes are two major ways to transform an IRModule. Also, a sequence of -# transformations on an IRModule is acceptable. Note that we can print an IRModule at **ANY** stage -# to TVMScript. After all transformations and optimizations are complete, we can build the IRModule -# to a runnable module to deploy on target devices. -# -# Based on the design of TensorIR and IRModule, we are able to create a new programming method: -# -# 1. Write a program by TVMScript in a python-AST based syntax. -# -# 2. Transform and optimize a program with python api. -# -# 3. Interactively inspect and try the performance with an imperative style transformation API. - - -################################################################################################ -# Create an IRModule -# ------------------ -# IRModule can be created by writing TVMScript, which is a round-trippable syntax for TVM IR. -# -# Different than creating a computational expression by Tensor Expression -# (:ref:`tutorial-tensor-expr-get-started`), TensorIR allow users to program through TVMScript, -# a language embedded in python AST. The new method makes it possible to write complex programs -# and further schedule and optimize it. -# -# Following is a simple example for vector addition. 
-# - - -@tvm.script.ir_module -class MyModule: - @T.prim_func - def main(a: T.handle, b: T.handle): - # We exchange data between function by handles, which are similar to pointer. - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # Create buffer from handles. - A = T.match_buffer(a, (8,), dtype="float32") - B = T.match_buffer(b, (8,), dtype="float32") - for i in range(8): - # A block is an abstraction for computation. - with T.block("B"): - # Define a spatial block iterator and bind it to value i. - vi = T.axis.spatial(8, i) - B[vi] = A[vi] + 1.0 - - -ir_module = MyModule -print(type(ir_module)) -print(ir_module.script()) - -################################################################################################ -# Besides, we can also use tensor expression DSL to write simple operators, and convert them -# to an IRModule. -# - -from tvm import te - -A = te.placeholder((8,), dtype="float32", name="A") -B = te.compute((8,), lambda *i: A(*i) + 1.0, name="B") -func = te.create_prim_func([A, B]) -ir_module_from_te = IRModule({"main": func}) -print(ir_module_from_te.script()) - - -################################################################################################ -# Build and Run an IRModule -# ------------------------- -# We can build the IRModule into a runnable module with specific target backends. -# - -mod = tvm.build(ir_module, target="llvm") # The module for CPU backends. -print(type(mod)) - -################################################################################################ -# Prepare the input array and output array, then run the module. -# - -a = tvm.nd.array(np.arange(8).astype("float32")) -b = tvm.nd.array(np.zeros((8,)).astype("float32")) -mod(a, b) -print(a) -print(b) - - -################################################################################################ -# Transform an IRModule -# --------------------- -# The IRModule is the central data structure for program optimization, which can be transformed -# by :code:`Schedule`. -# A schedule contains multiple primitive methods to interactively transform the program. -# Each primitive transforms the program in certain ways to bring additional performance optimizations. -# -# .. image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_tensor_ir_opt_flow.png -# :align: center -# :width: 100% -# -# The image above is a typical workflow for optimizing a tensor program. First, we need to create a -# schedule on the initial IRModule created from either TVMScript or Tensor Expression. Then, a -# sequence of schedule primitives will help to improve the performance. And at last, we can lower -# and build it into a runnable module. -# -# Here we just demonstrate a very simple transformation. First we create schedule on the input `ir_module`. - -sch = tvm.tir.Schedule(ir_module) -print(type(sch)) - -################################################################################################ -# Tile the loop into 3 loops and print the result. - -# Get block by its name -block_b = sch.get_block("B") -# Get loops surrounding the block -(i,) = sch.get_loops(block_b) -# Tile the loop nesting. -i_0, i_1, i_2 = sch.split(i, factors=[2, 2, 2]) -print(sch.mod.script()) - - -################################################################################################ -# We can also reorder the loops. Now we move loop `i_2` to outside of `i_1`. 
-sch.reorder(i_0, i_2, i_1) -print(sch.mod.script()) - - -################################################################################################ -# Transform to a GPU program -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# If we want to deploy models on GPUs, thread binding is necessary. Fortunately, we can -# also use primitives and do incrementally transformation. -# - -sch.bind(i_0, "blockIdx.x") -sch.bind(i_2, "threadIdx.x") -print(sch.mod.script()) - - -################################################################################################ -# After binding the threads, now build the IRModule with :code:`cuda` backends. -ctx = tvm.cuda(0) -cuda_mod = tvm.build(sch.mod, target="cuda") -cuda_a = tvm.nd.array(np.arange(8).astype("float32"), ctx) -cuda_b = tvm.nd.array(np.zeros((8,)).astype("float32"), ctx) -cuda_mod(cuda_a, cuda_b) -print(cuda_a) -print(cuda_b) diff --git a/gallery/tutorial/tvmc_command_line_driver.py b/gallery/tutorial/tvmc_command_line_driver.py deleted file mode 100644 index 58a8dc212d9f..000000000000 --- a/gallery/tutorial/tvmc_command_line_driver.py +++ /dev/null @@ -1,525 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Compiling and Optimizing a Model with TVMC -========================================== -**Authors**: -`Leandro Nunes `_, -`Matthew Barrett `_, -`Chris Hoge `_ - -In this section, we will work with TVMC, the TVM command line driver. TVMC is a -tool that exposes TVM features such as auto-tuning, compiling, profiling and -execution of models through a command line interface. - -Upon completion of this section, we will have used TVMC to accomplish the -following tasks: - -* Compile a pre-trained ResNet-50 v2 model for the TVM runtime. -* Run a real image through the compiled model, and interpret the output and - model performance. -* Tune the model on a CPU using TVM. -* Re-compile an optimized model using the tuning data collected by TVM. -* Run the image through the optimized model, and compare the output and model - performance. - -The goal of this section is to give you an overview of TVM and TVMC's -capabilities, and set the stage for understanding how TVM works. -""" - - -################################################################################ -# Using TVMC -# ---------- -# -# TVMC is a Python application, part of the TVM Python package. -# When you install TVM using a Python package, you will get TVMC -# as a command line application called ``tvmc``. The location of this command -# will vary depending on your platform and installation method. -# -# Alternatively, if you have TVM as a Python module on your -# ``$PYTHONPATH``, you can access the command line driver functionality -# via the executable python module, ``python -m tvm.driver.tvmc``. 
-# -# For simplicity, this tutorial will mention TVMC command line using -# ``tvmc ``, but the same results can be obtained with -# ``python -m tvm.driver.tvmc ``. -# -# You can check the help page using: -# -# .. code-block:: bash -# -# tvmc --help -# -# The main features of TVM available to ``tvmc`` are from subcommands -# ``compile``, and ``run``, and ``tune``. To read about specific options under -# a given subcommand, use ``tvmc --help``. We will cover each of -# these commands in this tutorial, but first we need to download a pre-trained -# model to work with. -# - - -################################################################################ -# Obtaining the Model -# ------------------- -# -# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a -# convolutional neural network that is 50 layers deep and designed to classify -# images. The model we will be using has been pre-trained on more than a -# million images with 1000 different classifications. The network has an input -# image size of 224x224. If you are interested exploring more of how the -# ResNet-50 model is structured, we recommend downloading `Netron -# `_, a freely available ML model viewer. -# -# For this tutorial we will be using the model in ONNX format. -# -# .. code-block:: bash -# -# wget https://github.com/onnx/models/raw/b9a54e89508f101a1611cd64f4ef56b9cb62c7cf/vision/classification/resnet/model/resnet50-v2-7.onnx -# - -################################################################################ -# .. admonition:: Supported model formats -# -# TVMC supports models created with Keras, ONNX, TensorFlow, TFLite -# and Torch. Use the option ``--model-format`` if you need to -# explicitly provide the model format you are using. See ``tvmc -# compile --help`` for more information. -# - -################################################################################ -# .. admonition:: Adding ONNX Support to TVM -# -# TVM relies on the ONNX python library being available on your system. You can -# install ONNX using the command ``pip3 install --user onnx onnxoptimizer``. You -# may remove the ``--user`` option if you have root access and want to install -# ONNX globally. The ``onnxoptimizer`` dependency is optional, and is only used -# for ``onnx>=1.9``. -# - -################################################################################ -# Compiling an ONNX Model to the TVM Runtime -# ------------------------------------------ -# -# Once we've downloaded the ResNet-50 model, the next step is to compile it. To -# accomplish that, we are going to use ``tvmc compile``. The output we get from -# the compilation process is a TAR package of the model compiled to a dynamic -# library for our target platform. We can run that model on our target device -# using the TVM runtime. -# -# .. code-block:: bash -# -# # This may take several minutes depending on your machine -# tvmc compile \ -# --target "llvm" \ -# --input-shapes "data:[1,3,224,224]" \ -# --output resnet50-v2-7-tvm.tar \ -# resnet50-v2-7.onnx -# -# Let's take a look at the files that ``tvmc compile`` creates in the module: -# -# .. code-block:: bash -# -# mkdir model -# tar -xvf resnet50-v2-7-tvm.tar -C model -# ls model -# -# You will see three files listed. -# -# * ``mod.so`` is the model, represented as a C++ library, that can be loaded -# by the TVM runtime. -# * ``mod.json`` is a text representation of the TVM Relay computation graph. -# * ``mod.params`` is a file containing the parameters for the pre-trained -# model. 
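-#
-# The packed ``.tar`` itself can also be loaded back and executed from Python without unpacking
-# it by hand. This is a sketch using the ``tvmc`` Python API with the ``resnet50-v2-7-tvm.tar``
-# produced above:
-#
-# .. code-block:: python
-#
-#    from tvm.driver import tvmc
-#
-#    package = tvmc.TVMCPackage(package_path="resnet50-v2-7-tvm.tar")
-#    result = tvmc.run(package, device="cpu")
-#    print(result)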
-# -# This module can be directly loaded by your application, and the model can be -# run via the TVM runtime APIs. - - -################################################################################ -# .. admonition:: Defining the Correct Target -# -# Specifying the correct target (option ``--target``) can have a huge -# impact on the performance of the compiled module, as it can take -# advantage of hardware features available on the target. For more -# information, please refer to :ref:`Auto-tuning a convolutional network for -# x86 CPU `. We recommend identifying which CPU you are -# running, along with optional features, and set the target appropriately. - -################################################################################ -# Running the Model from The Compiled Module with TVMC -# ---------------------------------------------------- -# -# Now that we've compiled the model to this module, we can use the TVM runtime -# to make predictions with it. TVMC has the TVM runtime built in to it, -# allowing you to run compiled TVM models. To use TVMC to run the model and -# make predictions, we need two things: -# -# - The compiled module, which we just produced. -# - Valid input to the model to make predictions on. -# -# Each model is particular when it comes to expected tensor shapes, formats and -# data types. For this reason, most models require some pre and -# post-processing, to ensure the input is valid and to interpret the output. -# TVMC has adopted NumPy's ``.npz`` format for both input and output data. This -# is a well-supported NumPy format to serialize multiple arrays into a file. -# -# As input for this tutorial, we will use the image of a cat, but you can feel -# free to substitute this image for any of your choosing. -# -# .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg -# :height: 224px -# :width: 224px -# :align: center - - -################################################################################ -# Input pre-processing -# ~~~~~~~~~~~~~~~~~~~~ -# -# For our ResNet-50 v2 model, the input is expected to be in ImageNet format. -# Here is an example of a script to pre-process an image for ResNet-50 v2. -# -# You will need to have a supported version of the Python Image Library -# installed. You can use ``pip3 install --user pillow`` to satisfy this -# requirement for the script. -# -# .. 
code-block:: python -# :caption: preprocess.py -# :name: preprocess.py -# -# #!python ./preprocess.py -# from tvm.contrib.download import download_testdata -# from PIL import Image -# import numpy as np -# -# img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" -# img_path = download_testdata(img_url, "imagenet_cat.png", module="data") -# -# # Resize it to 224x224 -# resized_image = Image.open(img_path).resize((224, 224)) -# img_data = np.asarray(resized_image).astype("float32") -# -# # ONNX expects NCHW input, so convert the array -# img_data = np.transpose(img_data, (2, 0, 1)) -# -# # Normalize according to ImageNet -# imagenet_mean = np.array([0.485, 0.456, 0.406]) -# imagenet_stddev = np.array([0.229, 0.224, 0.225]) -# norm_img_data = np.zeros(img_data.shape).astype("float32") -# for i in range(img_data.shape[0]): -# norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i] -# -# # Add batch dimension -# img_data = np.expand_dims(norm_img_data, axis=0) -# -# # Save to .npz (outputs imagenet_cat.npz) -# np.savez("imagenet_cat", data=img_data) -# - -################################################################################ -# Running the Compiled Module -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# With both the model and input data in hand, we can now run TVMC to make a -# prediction: -# -# .. code-block:: bash -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# resnet50-v2-7-tvm.tar -# -# Recall that the ``.tar`` model file includes a C++ library, a description of -# the Relay model, and the parameters for the model. TVMC includes the TVM -# runtime, which can load the model and make predictions against input. When -# running the above command, TVMC outputs a new file, ``predictions.npz``, that -# contains the model output tensors in NumPy format. -# -# In this example, we are running the model on the same machine that we used -# for compilation. In some cases we might want to run it remotely via an RPC -# Tracker. To read more about these options please check ``tvmc run --help``. - -################################################################################ -# Output Post-Processing -# ~~~~~~~~~~~~~~~~~~~~~~ -# -# As previously mentioned, each model will have its own particular way of -# providing output tensors. -# -# In our case, we need to run some post-processing to render the outputs from -# ResNet-50 v2 into a more human-readable form, using the lookup-table provided -# for the model. -# -# The script below shows an example of the post-processing to extract labels -# from the output of our compiled module. -# -# .. 
code-block:: python -# :caption: postprocess.py -# :name: postprocess.py -# -# #!python ./postprocess.py -# import os.path -# import numpy as np -# -# from scipy.special import softmax -# -# from tvm.contrib.download import download_testdata -# -# # Download a list of labels -# labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" -# labels_path = download_testdata(labels_url, "synset.txt", module="data") -# -# with open(labels_path, "r") as f: -# labels = [l.rstrip() for l in f] -# -# output_file = "predictions.npz" -# -# # Open the output and read the output tensor -# if os.path.exists(output_file): -# with np.load(output_file) as data: -# scores = softmax(data["output_0"]) -# scores = np.squeeze(scores) -# ranks = np.argsort(scores)[::-1] -# -# for rank in ranks[0:5]: -# print("class='%s' with probability=%f" % (labels[rank], scores[rank])) -# -# Running this script should produce the following output: -# -# .. code-block:: bash -# -# python postprocess.py -# # class='n02123045 tabby, tabby cat' with probability=0.610553 -# # class='n02123159 tiger cat' with probability=0.367179 -# # class='n02124075 Egyptian cat' with probability=0.019365 -# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 -# # class='n04040759 radiator' with probability=0.000261 -# -# Try replacing the cat image with other images, and see what sort of -# predictions the ResNet model makes. - -################################################################################ -# Automatically Tuning the ResNet Model -# ------------------------------------- -# -# The previous model was compiled to work on the TVM runtime, but did not -# include any platform specific optimization. In this section, we will show you -# how to build an optimized model using TVMC to target your working platform. -# -# In some cases, we might not get the expected performance when running -# inferences using our compiled module. In cases like this, we can make use of -# the auto-tuner, to find a better configuration for our model and get a boost -# in performance. Tuning in TVM refers to the process by which a model is -# optimized to run faster on a given target. This differs from training or -# fine-tuning in that it does not affect the accuracy of the model, but only -# the runtime performance. As part of the tuning process, TVM will try running -# many different operator implementation variants to see which perform best. -# The results of these runs are stored in a tuning records file, which is -# ultimately the output of the ``tune`` subcommand. -# -# In the simplest form, tuning requires you to provide three things: -# -# - the target specification of the device you intend to run this model on -# - the path to an output file in which the tuning records will be stored, and -# finally -# - a path to the model to be tuned. -# -# The example below demonstrates how that works in practice: -# -# .. code-block:: bash -# -# # The default search algorithm requires xgboost, see below for further -# # details on tuning search algorithms -# pip install xgboost -# -# tvmc tune \ -# --target "llvm" \ -# --output resnet50-v2-7-autotuner_records.json \ -# resnet50-v2-7.onnx -# -# In this example, you will see better results if you indicate a more specific -# target for the ``--target`` flag. For example, on an Intel i7 processor you -# could use ``--target llvm -mcpu=skylake``. For this tuning example, we are -# tuning locally on the CPU using LLVM as the compiler for the specified -# achitecture. 
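-#
-# The same tuning run can also be driven from Python. This is a sketch using the ``tvmc``
-# Python API introduced later in this guide; the ``-mcpu=skylake`` value is only an example,
-# and ``resnet50-v2-7.onnx`` is assumed to be in the working directory:
-#
-# .. code-block:: python
-#
-#    from tvm.driver import tvmc
-#
-#    model = tvmc.load("resnet50-v2-7.onnx")
-#    tvmc.tune(
-#        model,
-#        target="llvm -mcpu=skylake",
-#        tuning_records="resnet50-v2-7-autotuner_records.json",
-#    )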
-# -# TVMC will perform a search against the parameter space for the model, trying -# out different configurations for operators and choosing the one that runs -# fastest on your platform. Although this is a guided search based on the CPU -# and model operations, it can still take several hours to complete the search. -# The output of this search will be saved to the -# ``resnet50-v2-7-autotuner_records.json`` file, which will later be used to -# compile an optimized model. -# -# .. admonition:: Defining the Tuning Search Algorithm -# -# By default this search is guided using an ``XGBoost Grid`` algorithm. -# Depending on your model complexity and amount of time avilable, you might -# want to choose a different algorithm. A full list is available by -# consulting ``tvmc tune --help``. -# -# The output will look something like this for a consumer-level Skylake CPU: -# -# .. code-block:: bash -# -# tvmc tune \ -# --target "llvm -mcpu=broadwell" \ -# --output resnet50-v2-7-autotuner_records.json \ -# resnet50-v2-7.onnx -# # [Task 1/24] Current/Best: 9.65/ 23.16 GFLOPS | Progress: (60/1000) | 130.74 s Done. -# # [Task 1/24] Current/Best: 3.56/ 23.16 GFLOPS | Progress: (192/1000) | 381.32 s Done. -# # [Task 2/24] Current/Best: 13.13/ 58.61 GFLOPS | Progress: (960/1000) | 1190.59 s Done. -# # [Task 3/24] Current/Best: 31.93/ 59.52 GFLOPS | Progress: (800/1000) | 727.85 s Done. -# # [Task 4/24] Current/Best: 16.42/ 57.80 GFLOPS | Progress: (960/1000) | 559.74 s Done. -# # [Task 5/24] Current/Best: 12.42/ 57.92 GFLOPS | Progress: (800/1000) | 766.63 s Done. -# # [Task 6/24] Current/Best: 20.66/ 59.25 GFLOPS | Progress: (1000/1000) | 673.61 s Done. -# # [Task 7/24] Current/Best: 15.48/ 59.60 GFLOPS | Progress: (1000/1000) | 953.04 s Done. -# # [Task 8/24] Current/Best: 31.97/ 59.33 GFLOPS | Progress: (972/1000) | 559.57 s Done. -# # [Task 9/24] Current/Best: 34.14/ 60.09 GFLOPS | Progress: (1000/1000) | 479.32 s Done. -# # [Task 10/24] Current/Best: 12.53/ 58.97 GFLOPS | Progress: (972/1000) | 642.34 s Done. -# # [Task 11/24] Current/Best: 30.94/ 58.47 GFLOPS | Progress: (1000/1000) | 648.26 s Done. -# # [Task 12/24] Current/Best: 23.66/ 58.63 GFLOPS | Progress: (1000/1000) | 851.59 s Done. -# # [Task 13/24] Current/Best: 25.44/ 59.76 GFLOPS | Progress: (1000/1000) | 534.58 s Done. -# # [Task 14/24] Current/Best: 26.83/ 58.51 GFLOPS | Progress: (1000/1000) | 491.67 s Done. -# # [Task 15/24] Current/Best: 33.64/ 58.55 GFLOPS | Progress: (1000/1000) | 529.85 s Done. -# # [Task 16/24] Current/Best: 14.93/ 57.94 GFLOPS | Progress: (1000/1000) | 645.55 s Done. -# # [Task 17/24] Current/Best: 28.70/ 58.19 GFLOPS | Progress: (1000/1000) | 756.88 s Done. -# # [Task 18/24] Current/Best: 19.01/ 60.43 GFLOPS | Progress: (980/1000) | 514.69 s Done. -# # [Task 19/24] Current/Best: 14.61/ 57.30 GFLOPS | Progress: (1000/1000) | 614.44 s Done. -# # [Task 20/24] Current/Best: 10.47/ 57.68 GFLOPS | Progress: (980/1000) | 479.80 s Done. -# # [Task 21/24] Current/Best: 34.37/ 58.28 GFLOPS | Progress: (308/1000) | 225.37 s Done. -# # [Task 22/24] Current/Best: 15.75/ 57.71 GFLOPS | Progress: (1000/1000) | 1024.05 s Done. -# # [Task 23/24] Current/Best: 23.23/ 58.92 GFLOPS | Progress: (1000/1000) | 999.34 s Done. -# # [Task 24/24] Current/Best: 17.27/ 55.25 GFLOPS | Progress: (1000/1000) | 1428.74 s Done. 
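-#
-# Each line in the records file corresponds to one measured candidate. For a quick summary of
-# what was collected, the log can be read back with the ``tvm.autotvm`` record helpers. This is
-# a sketch; it assumes the tuning run above has finished and written
-# ``resnet50-v2-7-autotuner_records.json``:
-#
-# .. code-block:: python
-#
-#    from collections import Counter
-#
-#    from tvm import autotvm
-#
-#    counts = Counter()
-#    for inp, _res in autotvm.record.load_from_file("resnet50-v2-7-autotuner_records.json"):
-#        counts[inp.task.name] += 1
-#
-#    for task_name, n in counts.most_common():
-#        print(f"{n:5d}  {task_name}")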
-#
-# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to customize your tuning
-# process, in terms of number of repetitions (``--repeat`` and ``--number``, for example), the tuning
-# algorithm to be used, and so on. Check ``tvmc tune --help`` for more information.
-#
-# In some situations it might be a good idea to only tune specific tasks (i.e. the most relevant ones)
-# to waste less time tuning simpler workloads. The flag ``--task`` offers versatile options to limit
-# the tasks used for tuning, e.g. ``--task 20,22`` or ``--task 16-``. All available tasks can be printed
-# using ``--task list``.
-#
-
-################################################################################
-# Compiling an Optimized Model with Tuning Data
-# ----------------------------------------------
-#
-# As an output of the tuning process above, we obtained the tuning records
-# stored in ``resnet50-v2-7-autotuner_records.json``. This file can be used in
-# two ways:
-#
-# - As input to further tuning (via ``tvmc tune --tuning-records``).
-# - As input to the compiler.
-#
-# The compiler will use the results to generate high performance code for the
-# model on your specified target. To do that we can use ``tvmc compile
-# --tuning-records``. Check ``tvmc compile --help`` for more information.
-#
-# Now that tuning data for the model has been collected, we can re-compile the
-# model using optimized operators to speed up our computations.
-#
-# .. code-block:: bash
-#
-#   tvmc compile \
-#   --target "llvm" \
-#   --tuning-records resnet50-v2-7-autotuner_records.json \
-#   --output resnet50-v2-7-tvm_autotuned.tar \
-#   resnet50-v2-7.onnx
-#
-# Verify that the optimized model runs and produces the same results:
-#
-# .. code-block:: bash
-#
-#   tvmc run \
-#   --inputs imagenet_cat.npz \
-#   --output predictions.npz \
-#   resnet50-v2-7-tvm_autotuned.tar
-#
-#   python postprocess.py
-#
-# Verifying that the predictions are the same:
-#
-# .. code-block:: bash
-#
-#   # class='n02123045 tabby, tabby cat' with probability=0.610550
-#   # class='n02123159 tiger cat' with probability=0.367181
-#   # class='n02124075 Egyptian cat' with probability=0.019365
-#   # class='n02129604 tiger, Panthera tigris' with probability=0.001273
-#   # class='n04040759 radiator' with probability=0.000261
-
-################################################################################
-# Comparing the Tuned and Untuned Models
-# --------------------------------------
-#
-# TVMC gives you tools for basic performance benchmarking between the models.
-# You can specify the number of repetitions and have TVMC report on the model run
-# time (independent of runtime startup). We can get a rough idea of how much
-# tuning has improved the model performance. For example, on a test Intel i7
-# system, we see that the tuned model runs 47% faster than the untuned model:
-#
-# ..
code-block:: bash -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# --print-time \ -# --repeat 100 \ -# resnet50-v2-7-tvm_autotuned.tar -# -# # Execution time summary: -# # mean (ms) max (ms) min (ms) std (ms) -# # 92.19 115.73 89.85 3.15 -# -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# --print-time \ -# --repeat 100 \ -# resnet50-v2-7-tvm.tar -# -# # Execution time summary: -# # mean (ms) max (ms) min (ms) std (ms) -# # 193.32 219.97 185.04 7.11 -# - - -################################################################################ -# Final Remarks -# ------------- -# -# In this tutorial, we presented TVMC, a command line driver for TVM. We -# demonstrated how to compile, run, and tune a model. We also discussed the -# need for pre and post-processing of inputs and outputs. After the tuning -# process, we demonstrated how to compare the performance of the unoptimized -# and optimize models. -# -# Here we presented a simple example using ResNet-50 v2 locally. However, TVMC -# supports many more features including cross-compilation, remote execution and -# profiling/benchmarking. -# -# To see what other options are available, please have a look at ``tvmc -# --help``. -# -# In the `next tutorial `, we introduce the Python interface to TVM, -# and in the tutorial after that, -# `Compiling and Optimizing a Model with the Python Interface `, -# we will cover the same compilation and optimization steps using the Python -# interface. diff --git a/gallery/tutorial/tvmc_python.py b/gallery/tutorial/tvmc_python.py deleted file mode 100644 index a92c3af626f0..000000000000 --- a/gallery/tutorial/tvmc_python.py +++ /dev/null @@ -1,293 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Getting Starting using TVMC Python: a high-level API for TVM -============================================================= -**Author**: -`Jocelyn Shiue `_ - -Hi! Here we explain the scripting tool designed for the complete TVM beginner. 🙂 - -Before we get started let's get an example model if you don't already have one. -Follow the steps to download a resnet model via the terminal: - - .. code-block:: python - - mkdir myscripts - cd myscripts - wget https://github.com/onnx/models/raw/b9a54e89508f101a1611cd64f4ef56b9cb62c7cf/vision/classification/resnet/model/resnet50-v2-7.onnx - mv resnet50-v2-7.onnx my_model.onnx - touch tvmcpythonintro.py - -Let's start editing the python file in your favorite text editor. -""" - - -################################################################################ -# Step 0: Imports -# ~~~~~~~~~~~~~~~ -# -# .. 
code-block:: python -# -# from tvm.driver import tvmc -# -# - -################################################################################ -# Step 1: Load a model -# ~~~~~~~~~~~~~~~~~~~~ -# -# Let's import our model into tvmc. This step converts a machine learning model from -# a supported framework into TVM's high level graph representation language called Relay. -# This is to have a unified starting point for all models in tvm. The frameworks we currently -# support are: Keras, ONNX, Tensorflow, TFLite, and PyTorch. -# -# .. code-block:: python -# -# model = tvmc.load('my_model.onnx') #Step 1: Load -# -# If you'd like to see the Relay, you can run: -# ``model.summary()`` -# -# All frameworks support overwriting the input shapes with a shape_dict argument. -# For most frameworks this is optional, but for Pytorch this is necessary as -# TVM cannot automatically search for it. -# -# .. code-block:: python -# -# #model = tvmc.load('my_model.onnx', shape_dict={'input1' : [1, 2, 3, 4], 'input2' : [1, 2, 3, 4]}) #Step 1: Load + shape_dict -# -# A suggested way to see the model's input/shape_dict is via `netron `_. After opening the model, -# click the first node to see the name(s) and shape(s) in the inputs section. - - -################################################################################ -# Step 2: Compile -# ~~~~~~~~~~~~~~~ -# -# Now that our model is in Relay, our next step is to compile it to a desired -# hardware to run on. We refer to this hardware as a target. This compilation process -# translates the model from Relay into a lower-level language that the -# target machine can understand. -# -# In order to compile a model a tvm.target string is required. -# To learn more about tvm.targets and their options look at the `documentation `_. -# Some examples include: -# -# 1. cuda (Nvidia GPU) -# 2. llvm (CPU) -# 3. llvm -mcpu=cascadelake (Intel CPU) -# -# .. code-block:: python -# -# package = tvmc.compile(model, target="llvm") #Step 2: Compile -# -# -# The compilation step returns a package. -# - -################################################################################ -# Step 3: Run -# ~~~~~~~~~~~ -# -# The compiled package can now be run on the hardware target. The device -# input options are: CPU, Cuda, CL, Metal, and Vulkan. -# -# .. code-block:: python -# -# result = tvmc.run(package, device="cpu") #Step 3: Run -# -# And you can print the results: -# ``print(result)`` -# - -################################################################################ -# Step 1.5: Tune [Optional & Recommended] -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# Run speed can further be improved by tuning. This optional step uses -# machine learning to look at each operation within a model (a function) and -# tries to find a faster way to run it. We do this through a cost model, and -# benchmarking possible schedules. -# -# The target is the same as compile. -# -# .. code-block:: python -# -# tvmc.tune(model, target="llvm") #Step 1.5: Optional Tune -# -# The terminal output should look like: -# -# .. code-block:: python -# -# [Task 1/13] Current/Best: 82.00/ 106.29 GFLOPS | Progress: (48/769) | 18.56 s -# [Task 1/13] Current/Best: 54.47/ 113.50 GFLOPS | Progress: (240/769) | 85.36 s -# ..... -# -# There may be UserWarnings that can be ignored. -# This should make the end result faster, but it can take hours to tune. -# -# See the section 'Saving the Tuning Results' below. Be sure to pass the tuning -# results into compile if you want the results to apply. -# -# .. 
code-block:: python -# -# #tvmc.compile(model, target="llvm", tuning_records = "records.log") #Step 2: Compile - -################################################################################ -# Save and then start the process in the terminal: -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# .. code-block:: python -# -# python my_tvmc_script.py -# -# Note: Your fans may become very active -# - -################################################################################ -# Example results: -# ~~~~~~~~~~~~~~~~ -# -# .. code-block:: python -# -# Time elapsed for training: 18.99 s -# Execution time summary: -# mean (ms) max (ms) min (ms) std (ms) -# 25.24 26.12 24.89 0.38 -# -# -# Output Names: -# ['output_0'] -# - - -################################################################################ -# Additional TVMC Functionalities -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# - -################################################################################ -# Saving the model -# ~~~~~~~~~~~~~~~~ -# -# To make things faster for later, after loading the model (Step 1) save the Relay version. -# The model will then appear where you saved it for later in the coverted syntax. -# -# .. code-block:: python -# -# model = tvmc.load('my_model.onnx') #Step 1: Load -# model.save(desired_model_path) -# -# - -################################################################################ -# Saving the package -# ~~~~~~~~~~~~~~~~~~ -# -# After the model has been compiled (Step 2) the package also is also saveable. -# -# .. code-block:: python -# -# tvmc.compile(model, target="llvm", package_path="whatever") #Step 2: Compile -# -# new_package = tvmc.TVMCPackage(package_path="whatever") -# result = tvmc.run(new_package, device="cpu") #Step 3: Run -# -# - -################################################################################ -# Using Autoscheduler -# ~~~~~~~~~~~~~~~~~~~ -# -# Use the next generation of tvm to enable potentially faster run speed results. -# The search space of the schedules is automatically generated unlike -# previously where they needed to be hand written. (Learn more: -# `1 `_, -# `2 `_) -# -# .. code-block:: python -# -# tvmc.tune(model, target="llvm", enable_autoscheduler = True) -# -# - -################################################################################ -# Saving the tuning results -# ~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# The tuning results can be saved in a file for later reuse. -# -# Method 1: -# .. code-block:: python -# -# log_file = "hello.json" -# -# # Run tuning -# tvmc.tune(model, target="llvm", tuning_records=log_file) -# -# ... -# -# # Later run tuning and reuse tuning results -# tvmc.tune(model, target="llvm", prior_records=log_file) -# -# Method 2: -# .. code-block:: python -# -# # Run tuning -# tuning_records = tvmc.tune(model, target="llvm") -# -# ... -# -# # Later run tuning and reuse tuning results -# tvmc.tune(model, target="llvm", prior_records=tuning_records) -# - -################################################################################ -# Tuning a more complex model: -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# If you notice T's printing that look like ``.........T.T..T..T..T.T.T.T.T.T.`` -# increase the searching time frame: -# -# .. 
code-block:: python -# -# tvmc.tune(model,trials=10000,timeout=10,) -# - -################################################################################ -# Compiling a model for a remote device: -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# -# A remote procedural call (RPC) is useful when you would like to compile for hardware -# that is not on your local machine. The tvmc methods support this. -# To set up the RPC server take a look at the 'Set up RPC Server on Device' -# section in this `document `_. -# -# Within the TVMC Script include the following and adjust accordingly: -# -# .. code-block:: python -# -# tvmc.tune( -# model, -# target=target, # Compilation target as string // Device to compile for -# target_host=target_host, # Host processor -# hostname=host_ip_address, # The IP address of an RPC tracker, used when benchmarking remotely. -# port=port_number, # The port of the RPC tracker to connect to. Defaults to 9090. -# rpc_key=your_key, # The RPC tracker key of the target device. Required when rpc_tracker is provided -# ) -# diff --git a/gallery/tutorial/uma.py b/gallery/tutorial/uma.py deleted file mode 100644 index a62a8604535d..000000000000 --- a/gallery/tutorial/uma.py +++ /dev/null @@ -1,286 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tutorial-uma: - -Making your Hardware Accelerator TVM-ready with UMA -=================================================== -**Authors**: `Michael J. Klaiber `_, `Christoph Gerum `_, -`Paul Palomero Bernardo `_ - -""" - - -###################################################################### -# This is an introductory tutorial to the **Universal Modular Accelerator Interface** (UMA). -# UMA provides an easy-to-use API to integrate new hardware accelerators into TVM. -# -# This tutorial gives you step-by-step guidance how to use UMA to -# make your hardware accelerator TVM-ready. -# While there is no one-fits-all solution for this problem, UMA targets to provide a stable and Python-only -# API to integrate a number of hardware accelerator classes into TVM. -# -# -# In this tutorial you will get to know the UMA API in three use cases of increasing complexity. -# In these use case the three mock-accelerators -# **Vanilla**, **Strawberry** and **Chocolate** are introduced and -# integrated into TVM using UMA. -# - - -###################################################################### -# Vanilla -# ------------- -# **Vanilla** is a simple accelerator consisting of a MAC array and has no internal memory. -# It is can ONLY process Conv2D layers, all other layers are executed on a CPU, that also orchestrates **Vanilla**. -# Both the CPU and Vanilla use a shared memory. -# - -###################################################################### -# .. 
image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/tutorial/uma_vanilla_block_diagram.png -# :width: 100% -# :alt: A block diagram of Vanilla -# - -###################################################################### -# **Vanilla** has a C interface ``vanilla_conv2dnchw(...)``` for carrying out a Conv2D operation (including same-padding), -# that accepts pointers to input feature map, weights and result, -# as well as the dimensions of `Conv2D`: `oc`, `iw`, `ih`, `ic`, `kh`, `kw`. -# -# .. code-block:: c++ -# -# int vanilla_conv2dnchw(float* ifmap, float* weights, float* result, int oc, int iw, int ih, int ic, int kh, int kw); - - -################################################################################ -# The script `uma_cli` creates code skeletons with API-calls into the UMA-API for new accelerators. -# -# For **Vanilla** we use it as follows: (``--tutorial vanilla`` adds all the additional files required for this part of the tutorial) -# -# .. code-block:: bash -# -# pip install inflection -# cd $TVM_HOME/apps/uma -# python uma_cli.py --add_hardware vanilla_accelerator --tutorial vanilla -# - -################################################################################ -# uma_cli.py generates these files in the directory ``vanilla_accelerator`` which we are going to revisit. -# -# .. code-block:: bash -# -# backend.py -# codegen.py -# conv2dnchw.cc -# passes.py -# patterns.py -# run.py -# strategies.py - - -################################################################################ -# Vanilla backend -# -# The generated backend for vanilla is found in `vanilla_accelerator/backend.py`: - -###################################################################### -# -# .. code-block:: python -# -# class VanillaAcceleratorBackend(UMABackend): -# """UMA backend for VanillaAccelerator.""" -# -# def __init__(self): -# super().__init__() -# -# self._register_pattern("conv2d", conv2d_pattern()) -# self._register_tir_pass(PassPhase.TIR_PHASE_0, VanillaAcceleratorConv2DPass()) -# self._register_codegen(fmt="c", includes=gen_includes) -# -# @property -# def target_name(self): -# return "vanilla_accelerator" - - -################################################################################ -# Define offloaded patterns -# -# To specify that `Conv2D` is offloaded to **Vanilla**, it is described as Relay dataflow pattern -# (`DFPattern `_) in `vanilla_accelerator/patterns.py` - - -################################################################################ -# -# .. code-block:: python -# -# def conv2d_pattern(): -# pattern = is_op("nn.conv2d")(wildcard(), wildcard()) -# pattern = pattern.has_attr({"strides": [1, 1]}) -# return pattern - - -################################################################################ -# To map **Conv2D** operations from the input graph to **Vanilla**'s -# low level function call ``vanilla_conv2dnchw(...)``, the TIR pass -# *VanillaAcceleratorConv2DPass* (that will be discussed later in this tutorial) -# is registered in `VanillaAcceleratorBackend`. - - -################################################################################ -# Codegen - -################################################################################ -# The file ``vanilla_accelerator/codegen.py`` defines static C-code that is added to the -# resulting C-Code generated by TVMś C-Codegen in ``gen_includes``. -# Here C-code is added to include **Vanilla**'s low level library``vanilla_conv2dnchw()``. -# -# .. 
code-block:: python -# -# def gen_includes() -> str: -# topdir = pathlib.Path(__file__).parent.absolute() -# -# includes = "" -# includes += f'#include "{topdir}/conv2dnchw.cc"' -# return includes - - -################################################################################ -# As shown above in `VanillaAcceleratorBackend` it is registered to UMA with -# the `self._register_codegen` -# -# .. code-block:: python -# -# self._register_codegen(fmt="c", includes=gen_includes) - - -########################################################### -# Building the Neural Network and run it on Vanilla -# -# To demonstrate UMA's functionality, we will generate C code for a single Conv2D layer and run it on -# the Vanilla accelerator. -# The file ``vanilla_accelerator/run.py`` provides a demo running a Conv2D layer -# making use of Vanilla's C-API. -# -# -# .. code-block:: python -# -# def main(): -# mod, inputs, output_list, runner = create_conv2d() -# -# uma_backend = VanillaAcceleratorBackend() -# uma_backend.register() -# mod = uma_backend.partition(mod) -# target = tvm.target.Target("vanilla_accelerator", host=tvm.target.Target("c")) -# -# export_directory = tvm.contrib.utils.tempdir(keep_for_debug=True).path -# print(f"Generated files are in {export_directory}") -# compile_and_run( -# AOTModel(module=mod, inputs=inputs, outputs=output_list), -# runner, -# interface_api="c", -# use_unpacked_api=True, -# target=target, -# test_dir=str(export_directory), -# ) -# -# -# main() - -############################################################ -# By running ``vanilla_accelerator/run.py`` the output files are generated in the model library format (MLF). -# - -########################################################### -# Output: -# -# .. code-block:: bash -# -# Generated files are in /tmp/tvm-debug-mode-tempdirs/2022-07-13T13-26-22___x5u76h0p/00000 - -########################################################### -# Let's examine the generated files: -# -# -# Output: -# -# .. code-block:: bash -# -# cd /tmp/tvm-debug-mode-tempdirs/2022-07-13T13-26-22___x5u76h0p/00000 -# cd build/ -# ls -1 -# -# codegen -# lib.tar -# metadata.json -# parameters -# runtime -# src - -########################################################### -# To evaluate the generated C code go to ``codegen/host/src/default_lib2.c`` -# -# .. code-block:: bash -# -# cd codegen/host/src/ -# ls -1 -# -# default_lib0.c -# default_lib1.c -# default_lib2.c -# - -########################################################### -# In `default_lib2.c` you can now see that the generated code calls -# into Vanilla's C-API and executes a Conv2D layer: -# -# .. code-block:: c++ -# -# TVM_DLL int32_t tvmgen_default_vanilla_accelerator_main_0(float* placeholder, float* placeholder1, float* conv2d_nchw, uint8_t* global_workspace_1_var) { -# vanilla_accelerator_conv2dnchw(placeholder, placeholder1, conv2d_nchw, 32, 14, 14, 32, 3, 3); -# return 0; -# } -# - - -########################################################### -# Strawberry -# --------------- -# Coming soon ... - -########################################################### -# Chocolate -# -------------- -# Coming soon ... -# - -###################################################################### -# Request for Community Input -# ----------------------------- -# If this tutorial **did not** fit to your accelerator, lease add your requirements to the UMA thread in -# the TVM discuss forum: `Link `_. 
-# We are eager to extend this tutorial to provide guidance on making further classes of AI hardware -# accelerators TVM-ready using the UMA interface. -# - -###################################################################### -# References -# ----------- -# [UMA-RFC] `UMA: Universal Modular Accelerator Interface `_, -# TVM RFC, June 2022. -# -# [DFPattern] `Pattern Matching in Relay `_ -# diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index cfd7ef554b7b..49c203b7267a 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -119,12 +119,6 @@ fi clean_files -# prepare auto scheduler tutorials -rm -rf gallery/how_to/tune_with_auto_scheduler/*.json -rm -rf gallery/tutorial/*.json -cp -f gallery/how_to/tune_with_autoscheduler/ci_logs/*.json gallery/how_to/tune_with_autoscheduler -cp -f gallery/how_to/tune_with_autoscheduler/ci_logs/*.json gallery/tutorial - # cleanup stale log files find . -type f -path "*.log" | xargs rm -f