Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[microTVM] Use QNN schedules to give SOTA performance #13752

Merged
merged 19 commits into from
Mar 2, 2023
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions include/tvm/relay/transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,13 @@ TVM_DLL Pass RemoveUnusedFunctions(Array<runtime::String> entry_functions);
*/
TVM_DLL Pass SimplifyExpr();

/*!
 * \brief Stripped down version of SimplifyExpr which is run after AlterOpLayout.
 *
 * NOTE(review): presumably restricted to the SimplifyExpr rewrites that remain
 * valid once layouts have been rewritten — confirm against the pass definition
 * in src/relay/transforms/simplify_expr.cc.
 *
 * \return The pass.
 */
TVM_DLL Pass SimplifyExprPostAlterOp();

/*!
* \brief Run any custom passes registered under "RelayToTIR" attributes on TargetKinds.
*
Expand Down
17 changes: 0 additions & 17 deletions python/tvm/relay/op/nn/_nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,23 +877,6 @@ def convert_deformable_conv2d(attrs, inputs, tinfos, desired_layouts):
return relay.nn.deformable_conv2d(data, offset, weight, **new_attrs)


# QNN ops
@reg.register_alter_op_layout("add")
def alter_op_layout_add(attrs, inputs, tinfos, out_type):
    """Alter the layout of an add op.

    Useful for fusing the bias constant with an input zero point constant from a
    preceding quantized op. Only takes effect when the previous op is quantized,
    which is why the implementation lives in topi.nn.qnn.
    """
    alter = topi.nn.qnn.qnn_add_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


@reg.register_alter_op_layout("qnn.requantize")
def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type):
    """Alter the layout of a requantization op."""
    alter = topi.nn.qnn.qnn_requantize_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


# bitpack
@reg.register_compute("nn.bitpack")
def compute_bitpack(attrs, inputs, out_dtype):
Expand Down
60 changes: 58 additions & 2 deletions python/tvm/relay/qnn/op/_qnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,20 @@
# pylint: disable=invalid-name, unused-argument, len-as-condition
"""QNN operator feature registration"""

import numpy as np

from tvm import topi

from .. import strategy
from ...op.op import register_compute
from ...op.op import register_injective_schedule
from ...op.op import register_strategy, register_pattern, register_alter_op_layout, OpPattern
from ...op.op import (
OpPattern,
register_alter_op_layout,
register_legalize,
register_pattern,
register_strategy,
)


@register_compute("qnn.simulated_quantize")
Expand Down Expand Up @@ -85,12 +93,60 @@ def simulated_dequantize_compute(attrs, inputs, output_type):
register_strategy("qnn.conv2d", strategy.qnn_conv2d_strategy)


@register_legalize("clip")
def legalize_clip(attrs, inputs, tinfos):
    """Remove clip ops whose bounds match the full range of their dtype.

    TVM's simplification passes already do this after alter_op, but certain QNN
    operator implementations (like Cortex-M) need the clip removed earlier,
    during legalization. Only a clip fed directly by a qnn.requantize op is
    considered; all other clips are left untouched (return None).
    """
    producer = inputs[0]
    if hasattr(producer, "op") and producer.op.name == "qnn.requantize":
        bounds = np.iinfo(tinfos[0].dtype)
        # Operand order matters: attrs values may be TVM scalar objects whose
        # reflected __eq__ is relied upon here, matching the original check.
        if bounds.min == attrs.a_min and bounds.max == attrs.a_max:
            return producer
    return None


@register_legalize("nn.bias_add")
def legalize_bias_add(attrs, inputs, tinfos):
    """Legalize a bias add operator.

    May be used to "fold in" unused channels from quantized convolution
    operators. Doing this before layout rewrites minimizes the number of extra
    "cast" and "layout_transform" overhead operators that get introduced.
    """
    legalize = topi.nn.bias_add_legalize
    return legalize(attrs, inputs, tinfos)


@register_alter_op_layout("qnn.conv2d")
def alter_op_layout_qnn_conv2d(attrs, inputs, tinfos, out_type):
    """Alter the layout of a qnn conv2d op.

    May be used to alter the current QNN Conv2D op, but can also be used to
    alter previous ops to better match the current op. For example, Arm
    Cortex-M uses this to set the out_layout of previous ops to the input
    layout preferred by future layouts.
    """
    alter = topi.nn.qnn_conv2d_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


@register_alter_op_layout("add")
def alter_op_layout_add(attrs, inputs, tinfos, out_type):
    """Alter the layout of an add op.

    Useful for fusing the bias constant with an input zero point constant from
    a preceding quantized op. Only takes effect when the previous op is
    quantized.
    """
    alter = topi.nn.add_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


@register_alter_op_layout("qnn.requantize")
def alter_op_layout_qnn_requantize(attrs, inputs, tinfos, out_type):
    """Alter the layout of a requantization op."""
    alter = topi.nn.qnn_requantize_alter_layout
    return alter(attrs, inputs, tinfos, out_type)


# qnn.dense
register_strategy("qnn.dense", strategy.qnn_dense_strategy)

Expand Down
73 changes: 67 additions & 6 deletions python/tvm/relay/qnn/strategy/arm_cpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,55 @@
regular/depthwise conv2d is supported, but qnn_dense will be added eventually."""

from tvm import topi, TVMError
from .generic import qnn_conv2d_strategy
from tvm.topi.utils import get_const_tuple
from ... import op as _op
from ...op.strategy.generic import is_depthwise_conv2d
from .generic import (
qnn_conv2d_strategy,
qnn_dense_strategy,
qnn_dequantize_strategy,
qnn_quantize_strategy,
wrap_compute_dequantize,
wrap_compute_quantize,
wrap_topi_qnn_dense,
wrap_topi_schedule,
)


@qnn_quantize_strategy.register("arm_cpu")
def qnn_quantize_strategy_arm_cpu(_attrs, _inputs, _out_type, _target):
    """qnn.quantize strategy for arm_cpu.

    Reuses the Hexagon compute and schedule. NOTE(review): per the PR
    discussion these Hexagon schedules are naive but acceptable for now, as
    quantize is not a hot spot in convolutional models.
    """
    out = _op.OpStrategy()
    out.add_implementation(
        wrap_compute_quantize(topi.hexagon.qnn_quantize),
        wrap_topi_schedule(topi.hexagon.schedule_qnn_quantize),
        name="qnn_quantize.arm_cpu",
    )
    return out


@qnn_dequantize_strategy.register("arm_cpu")
def qnn_dequantize_strategy_arm_cpu(_attrs, _inputs, _out_type, _target):
    """qnn.dequantize strategy for arm_cpu.

    Reuses the Hexagon compute and schedule. NOTE(review): per the PR
    discussion these Hexagon schedules are naive but acceptable for now.
    """
    out = _op.OpStrategy()
    out.add_implementation(
        wrap_compute_dequantize(topi.hexagon.qnn_dequantize),
        wrap_topi_schedule(topi.hexagon.schedule_qnn_dequantize),
        name="qnn_dequantize.arm_cpu",
    )
    return out


@qnn_dense_strategy.register("arm_cpu")
def qnn_dense_strategy_arm_cpu(_attrs, _inputs, _out_type, _target):
    """qnn.dense strategy for arm_cpu.

    Reuses the Hexagon compute and schedule. NOTE(review): per the PR
    discussion the Hexagon dense schedule is naive, but dense ops do not take
    much time on convolutional models, so this is acceptable until a dedicated
    Cortex-M schedule lands.
    """
    out = _op.OpStrategy()
    out.add_implementation(
        wrap_topi_qnn_dense(topi.hexagon.qnn_dense),
        wrap_topi_schedule(topi.hexagon.schedule_qnn_dense),
        name="qnn_dense.arm_cpu",
    )
    return out


@qnn_conv2d_strategy.register("arm_cpu")
Expand Down Expand Up @@ -59,13 +105,28 @@ def qnn_conv2d_strategy_arm_cpu(attrs, inputs, _out_type, target):
topi.arm_cpu.schedule_qnn_conv2d,
name="qnn_conv2d.arm_cpu",
)
else:
raise TVMError("QNN regular Conv2D for Arm Cortex-M DSP got incorrect input layout!")
elif is_depthwise_conv2d(data.shape, data_layout, kernel.shape, kernel_layout, groups):
if data_layout == "NCHW" and kernel_layout == "IOHW":
strategy.add_implementation(
topi.arm_cpu.qnn_depthwise_conv2d,
topi.arm_cpu.schedule_qnn_depthwise_conv2d,
name="qnn_depthwise_conv2d.arm_cpu",
)
height, width = data.shape[2:]
y_stride, x_stride = get_const_tuple(attrs.strides)
if height * width * y_stride % 2 == 0:
strategy.add_implementation(
topi.arm_cpu.qnn_depthwise_conv2d,
topi.arm_cpu.schedule_qnn_depthwise_conv2d,
name="qnn_depthwise_conv2d.arm_cpu",
)
elif y_stride == x_stride == 1:
strategy.add_implementation(
topi.arm_cpu.qnn_unrolled_depthwise_conv2d,
topi.arm_cpu.schedule_qnn_unrolled_depthwise_conv2d,
name="qnn_unrolled_depthwise_conv2d.arm_cpu",
)
else:
raise TVMError("No QNN depthwise Conv2D Cortex-M schedule supports these params!")
else:
raise TVMError("QNN depthwise Conv2D for Arm Cortex-M DSP got incorrect input layout!")
else:
raise TVMError("No Arm Cortex-M DSP strategy exists for generic group qnn.conv2d")

Expand Down
3 changes: 2 additions & 1 deletion python/tvm/topi/arm_cpu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@
from .conv2d_transpose import *
from .conv2d_int8 import *
from . import conv2d_alter_op
from . import qnn_alter_op
from .bitserial_conv2d import *
from .bitserial_dense import *
from .injective import *
from .group_conv2d import *
from .pooling import *
from .dense import *
from .qnn import *
from . import qnn_alter_op
from . import qnn_legalize
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,14 @@ def _write_sums_to_memory(num_outputs, offset, stride) -> Iterator[str]:
num_packed = (num_outputs - offset) // 2
for i in range(num_packed):
index = 2 * i + offset
yield f"int32_t packed_res_{i} = requant_{index} + (requant_{index + 1} << 16);"
# We must explicitly call asm inline to use the PKHBT instruction. It is not part of
# ACLE and has no __builtin. Writing it using masks and bitshifts does not work either:
# Arm GCC 12 with -O3 does not compile these efficiently.
yield f"int packed_res_{i};"
yield (
f'__asm__ ("pkhbt %0, %1, %2, lsl #16" : "=r" (packed_res_{i}) : '
f'"r" (requant_{index}), "r" (requant_{index + 1}));'
)

if offset == 1:
yield "((int16_t*) output)[1] = (int16_t) requant_0;"
Expand Down
Loading