Add support for QLinearMul ONNX op #8773

Merged 4 commits on Aug 18, 2021

Changes from all commits
35 changes: 35 additions & 0 deletions python/tvm/relay/frontend/onnx.py
@@ -3279,6 +3279,40 @@ def get_scalar(x, dtype="float32"):
        return _qnn.op.quantize(out, c_scale, c_zero_point, out_dtype=dtype)


class QLinearMul(OnnxOpConverter):
    """Operator converter for QLinearMul from Microsoft onnxruntime contrib opset."""

    @classmethod
    def _impl_v10(cls, inputs, attr, params):
        def get_scalar(x, dtype="float32"):
            if isinstance(x, _expr.Var) and x.name_hint in params:
                return _op.const(params[x.name_hint].numpy(), dtype)
            rank = len(infer_shape(x))
            assert rank <= 1, "QLinearMul scale and zero_point input must be scalars"
            if rank == 1:
                x = _op.squeeze(x, [0])
            return _op.cast(x, dtype)

        a = inputs[0]
        a_scale = get_scalar(inputs[1])
        a_zero_point = get_scalar(inputs[2], "int32")
        b = inputs[3]
        b_scale = get_scalar(inputs[4])
        b_zero_point = get_scalar(inputs[5], "int32")
        y_scale = fold_constant(get_scalar(inputs[6]))
        y_zero_point = get_scalar(inputs[7], "int32")

        dtype = infer_type(a).checked_type.dtype

        ## Onnxruntime doesn't actually do this op in integer; it dequantizes to fp32
        ## and then requantizes after
        ## https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/core/mlas/lib/qlmul.cpp
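        ## i.e. the graph emitted below computes, in fp32,
        ##   y = clamp(round(((a - a_zp) * a_scale * (b - b_zp) * b_scale) / y_scale) + y_zp)
        ## rather than using a pure integer multiply kernel.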
Comment on lines +3307 to +3309
Contributor:
😱

Contributor:

This seems to be the ORT implementation of QLinearMul, not QLinearMatMul?

Contributor (Author):

Translated everything to QLinearMul instead 🤔, let me know if it looks right to you now.

Comment on lines +3307 to +3309
Contributor:

Even if onnxruntime is performing fp32 operations, is there any reason to do the same here? Wouldn't it be better (at least somewhat, for performance) to requantize both inputs 'a' and 'b' as per the output scale and zero_point and then perform integer matmul?

Contributor:

We have a pass (FakeQuantizationToInteger) to do exactly that, if the user wants to accept that small accuracy difference in exchange for the speedup:

def FakeQuantizationToInteger():
    # pylint: disable=anomalous-backslash-in-string
    """
    Find regions of the graph of the form

    .. code-block:: text

        x    w
        |    |
        dq   dq
         \   /
          op1
           |
          op2
           |
           q

    where ``q == qnn.quantize`` and ``dq = qnn.dequantize``
    and rewrite them into integer versions of ``op1`` and ``op2``

    Rules for rewriting individual ops are in fake_quantization_to_integer.py

    Returns
    -------
    ret : tvm.transform.Pass
        The registered SimplifyExpr pass.
    """
    return _ffi_api.FakeQuantizationToInteger()

BTW - this is elementwise multiplication, not matmul
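
For context, here is a minimal sketch (not part of this PR; the model path and input shapes are hypothetical placeholders) of how a user could opt in to that trade-off: import an ORT-quantized model and run FakeQuantizationToInteger so the dequantize -> multiply -> quantize regions emitted by this converter are rewritten into integer QNN ops before compilation.

import onnx
import tvm
from tvm import relay

# Hypothetical statically quantized model, e.g. produced by onnxruntime.quantization.quantize_static
onnx_model = onnx.load("model.quant.onnx")
mod, params = relay.frontend.from_onnx(onnx_model, shape={"a": (4, 2), "b": (4, 2)})

# Rewrite fake-quantized (dq -> op -> q) regions into integer ops,
# accepting a small accuracy difference in exchange for integer execution.
mod = relay.transform.InferType()(mod)
mod = relay.transform.FakeQuantizationToInteger()(mod)

with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="llvm", params=params)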

Contributor:

That makes sense. Thanks!

        a = _qnn.op.dequantize(inputs[0], a_scale, a_zero_point)
        b = _qnn.op.dequantize(inputs[3], b_scale, b_zero_point)
        out = _op.multiply(a, b)
        return _qnn.op.quantize(out, y_scale, y_zero_point, out_dtype=dtype)


class ConvInteger(OnnxOpConverter):
"""Operator converter for ConvInteger."""

@@ -3605,6 +3639,7 @@ def _get_convert_map(opset):
"ReverseSequence": ReverseSequence.get_converter(opset),
"QLinearConv": QLinearConv.get_converter(opset),
"QLinearAdd": QLinearAdd.get_converter(opset),
"QLinearMul": QLinearMul.get_converter(opset),
"ConvInteger": ConvInteger.get_converter(opset),
# Random number generation.
"RandomUniform": RandomUniform.get_converter(opset),
95 changes: 67 additions & 28 deletions tests/python/frontend/onnx/test_forward.py
@@ -234,6 +234,39 @@ def verify_with_ort(
)


def quantize_and_verify_with_ort(onnx_model, input_names, input_shapes, target, dev):
    from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType

    input_arrays = [np.random.random(shape).astype("float32") for shape in input_shapes]

    class RandomDataReader(CalibrationDataReader):
        def __init__(self, n=10):
            input_dict = dict(zip(input_names, input_shapes))
            self.data = iter(
                [
                    {
                        name: np.random.random(shape).astype("float32")
                        for name, shape in input_dict.items()
                    }
                    for _ in range(n)
                ]
            )

        def get_next(self):
            return next(self.data, None)

    d = tvm.contrib.utils.tempdir()
    model_fp32 = os.path.join(d.temp_dir, "model.onnx")
    onnx.save_model(onnx_model, model_fp32)
    model_quant = os.path.join(d.temp_dir, "model.quant.onnx")
    quantized_model = quantize_static(model_fp32, model_quant, RandomDataReader())
    # opt_level=1 will cause error with qnn lowering
    model = onnx.load(model_quant)
    verify_with_ort_with_inputs(
        model, input_arrays, opt_level=2, target=target, dev=dev, use_vm=True
    )


def make_constant_node(name, data_type, dims, vals):
    return helper.make_node(
        "Constant",
@@ -5273,47 +5306,53 @@ def verify_qlinearadd(a_shape, b_shape, c_shape):
        ]
        input_values = [a_array, b_array]

-        node = helper.make_node("QLinearAdd", inputs=input_names, outputs=["C"])
-
+        node = helper.make_node("Add", ["a", "b"], ["C"])
        graph = helper.make_graph(
            [node],
            "qlinearadd_test",
            inputs=input_nodes,
            outputs=[helper.make_tensor_value_info("C", TensorProto.FLOAT, list(c_shape))],
        )
-        model = helper.make_model(graph, producer_name="qlinearconv_test")
-        from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType
-
-        class RandomDataReader(CalibrationDataReader):
-            def __init__(self, n=10):
-                self.data = iter(
-                    [
-                        {
-                            "a": np.random.random(a_shape).astype("float32"),
-                            "b": np.random.random(b_shape).astype("float32"),
-                        }
-                        for _ in range(n)
-                    ]
-                )
-
-            def get_next(self):
-                return next(self.data, None)
-
-        d = tvm.contrib.utils.tempdir()
-        model_fp32 = os.path.join(d.temp_dir, "model.onnx")
-        onnx.save_model(model, model_fp32)
-        model_quant = os.path.join(d.temp_dir, "model.quant.onnx")
-        quantized_model = quantize_static(model_fp32, model_quant, RandomDataReader())
-        # opt_level=1 will cause error with qnn lowering
-        model = onnx.load(model_quant)
-        verify_with_ort_with_inputs(model, input_values, opt_level=2, target=target, dev=dev)
+        model = helper.make_model(graph, producer_name="qlinearadd_test")
+        quantize_and_verify_with_ort(model, input_names, [a_shape, b_shape], target, dev)

    verify_qlinearadd([4, 2], [4, 2], [4, 2])
    verify_qlinearadd([4, 2], [2], [4, 2])
    verify_qlinearadd([5, 1, 7], [2, 7], [5, 2, 7])


@tvm.testing.parametrize_targets
def test_qlinearmul(target, dev):
    def verify_qlinearmul(a_shape, b_shape, c_shape):

        a_array = np.random.random(a_shape).astype("float32")
        b_array = np.random.random(b_shape).astype("float32")

        input_nodes = [
            helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)),
            helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape)),
        ]
        input_names = [
            "a",
            "b",
        ]
        input_values = [a_array, b_array]

        node = helper.make_node("Mul", input_names, ["C"])
        graph = helper.make_graph(
            [node],
            "qlinearmul_test",
            inputs=input_nodes,
            outputs=[helper.make_tensor_value_info("C", TensorProto.FLOAT, list(c_shape))],
        )
        model = helper.make_model(graph, producer_name="qlinearmul_test")
        quantize_and_verify_with_ort(model, input_names, [a_shape, b_shape], target, dev)

    verify_qlinearmul([4, 2], [4, 2], [4, 2])
    verify_qlinearmul([4, 2], [2], [4, 2])
    verify_qlinearmul([5, 1, 7], [2, 7], [5, 2, 7])


@tvm.testing.parametrize_targets
def test_random_uniform(target, dev):
    def get_random_uniform(shape, dtype="float32", high=1.0, low=0.0, seed=None):