From 57338631208232c8e14f8a11e296cc2dc73d4ff1 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Tue, 15 Sep 2020 16:12:49 +0100 Subject: [PATCH 1/3] [BYOC][ACL] Support add operation Added support for an "add" operation implemented via ACL for fp32 and quantized uint8 data types --- .../tvm/relay/op/contrib/arm_compute_lib.py | 20 +++ .../contrib/arm_compute_lib/acl_runtime.cc | 51 ++++++- .../contrib/test_arm_compute_lib/test_add.py | 135 ++++++++++++++++++ 3 files changed, 200 insertions(+), 6 deletions(-) create mode 100644 tests/python/contrib/test_arm_compute_lib/test_add.py diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index 77fdbbd4006c..0578de799323 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -345,3 +345,23 @@ def maximum(attrs, args): type_a = args[0].checked_type type_b = args[0].checked_type return (type_a.dtype == "float32") and (type_b.dtype == "float32") + + +@tvm.ir.register_op_attr("add", "target.arm_compute_lib") +def add(attrs, args): + """Check if the external ACL codegen for add should be used.""" + for typ in [args[0].checked_type, args[1].checked_type]: + if typ.dtype != "float32": + return False + + return True + + +@tvm.ir.register_op_attr("qnn.add", "target.arm_compute_lib") +def qnn_add(attrs, args): + """Check if the external ACL codegen for add should be used.""" + for typ in [args[0].checked_type, args[1].checked_type]: + if typ.dtype != "uint8": + return False + + return True diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index 3fb4e0a3575f..26f18dc677d8 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -30,6 +30,7 @@ #ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB #include +#include #include #include #include @@ -142,6 +143,8 @@ class ACLRuntime : public 
JSONRuntimeBase { CreateReshapeLayer(&layer_, node); } else if ("maximum" == op_name) { CreateMaximumLayer(&layer_, node); + } else if ("add" == op_name || "qnn.add" == op_name) { + CreateAddLayer(&layer_, node); } else { LOG(FATAL) << "Unsupported op: " << op_name; } @@ -417,6 +420,45 @@ class ACLRuntime : public JSONRuntimeBase { function->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0]); layer->function = function; } + /*! + * \brief Creates an add/qnn.add layer + * + * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function. + * \param node The JSON representation of the operator. + */ + void CreateAddLayer(CachedLayer* layer, const JSONGraphNode& node) { + auto op_name = node.GetOpName(); + if ("add" == op_name) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1])); + layer->outputs.push_back(MakeACLTensorFromJSONNode(node)); + } else if ("qnn.add" == op_name) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0], &node.GetInputs()[2], + &node.GetInputs()[3])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1], &node.GetInputs()[4], + &node.GetInputs()[5])); + layer->outputs.push_back( + MakeACLTensorFromJSONNode(node, &node.GetInputs()[6], &node.GetInputs()[7])); + } else { + throw std::runtime_error("Unsupported form of add op: " + op_name); + } + + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/S16/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/S16/F16/F32 + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/F32 + * @param[in] policy Policy to use to handle overflow. 
+ * void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy); + * + * arm_compute::ConvertPolicy::SATURATE is used as add_QASYMM8_QASYMM8_QASYMM8 currently + * always saturates result + */ + auto f = std::make_shared(); + f->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0], + arm_compute::ConvertPolicy::SATURATE); + layer->function = f; + } /*! \brief Allow ACL functions to request auxiliary memory from TVM. */ ACLAllocator allocator_; @@ -437,7 +479,6 @@ class ACLRuntime : public JSONRuntimeBase { } #endif }; - runtime::Module ACLRuntimeCreate(const String& symbol_name, const String& graph_json, const Array& const_names) { auto n = make_object(symbol_name, graph_json, const_names); @@ -445,10 +486,8 @@ runtime::Module ACLRuntimeCreate(const String& symbol_name, const String& graph_ } TVM_REGISTER_GLOBAL("runtime.arm_compute_lib_runtime_create").set_body_typed(ACLRuntimeCreate); - TVM_REGISTER_GLOBAL("runtime.module.loadbinary_arm_compute_lib") .set_body_typed(JSONRuntimeBase::LoadFromBinary); - -} // namespace contrib -} // namespace runtime -} // namespace tvm +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/tests/python/contrib/test_arm_compute_lib/test_add.py b/tests/python/contrib/test_arm_compute_lib/test_add.py new file mode 100644 index 000000000000..c2b68702b6c2 --- /dev/null +++ b/tests/python/contrib/test_arm_compute_lib/test_add.py @@ -0,0 +1,135 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Arm Compute Library integration add tests."""
+
+import numpy as np
+
+import tvm
+import tvm.testing
+from tvm import relay
+
+from test_arm_compute_lib.infrastructure import (
+    skip_runtime_test,
+    skip_codegen_test,
+    build_and_run,
+    verify,
+    verify_codegen,
+)
+from test_arm_compute_lib.infrastructure import Device
+
+_qnn_params = {
+    "lhs_scale": relay.const(0.0156863, "float32"),
+    "lhs_zero_point": relay.const(127, "int32"),
+    "rhs_scale": relay.const(0.0117647, "float32"),
+    "rhs_zero_point": relay.const(85, "int32"),
+    "output_scale": relay.const(0.0235294, "float32"),
+    "output_zero_point": relay.const(128, "int32"),
+}
+
+
+def _get_model(shape, dtype, var_names, op, op_params):
+    a = relay.var(next(var_names), shape=shape, dtype=dtype)
+    b = relay.var(next(var_names), shape=shape, dtype=dtype)
+    return op(a, b, **op_params)
+
+
+def _get_expected_codegen(shape, dtype, op_name, qnn_params):
+    input_a = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
+    input_b = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
+    input_qnn = [
+        {
+            "op": "const",
+            "name": "",
+            "attrs": {
+                "shape": [[list(qnn_params[_].data.shape)]],
+                "dtype": [[qnn_params[_].data.dtype]],
+            },
+        }
+        for _ in qnn_params
+    ]
+    inputs = [input_a, input_b, *input_qnn]
+    node = {
+        "op": "kernel",
+        "name": op_name,
+        "inputs": [[_, 0, 0] for _ in range(len(inputs))],
+        "attrs": {
+            "num_inputs": str(len(inputs)),
+            "num_outputs": "1",
+            "shape": [[list(shape)]],
+            "dtype": [[dtype]],
+        },
+    }
+
+    
return [*inputs, node] + + +def test_runtime_add(): + Device.load("test_config.json") + + if skip_runtime_test(): + return + + device = Device() + np.random.seed(0) + + for dtype, low, high, atol, rtol, op, op_params in [ + ("float32", -127, 128, 1e-7, 1e-7, relay.add, {}), + ("uint8", 0, 255, 0.0, 1.0, relay.qnn.op.add, _qnn_params), + ]: + shape = (2, 2) + for inputs in [ + { + "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)), + "b": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)), + } + ]: + outputs = [] + func = _get_model(shape, dtype, iter(inputs), op, op_params) + for acl in [True, False]: + outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0]) + + config = { + "shape": shape, + "dtype": dtype, + "inputs": inputs, + "operation": op, + "op_params": op_params, + } + + # verify_saturation=False as the result of add_QASYMM8_QASYMM8_QASYMM8 + # is always saturated currently. + verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=False) + + +def test_runtime_codegen_add(): + if skip_codegen_test(): + return + + inputs = {"a", "b"} + for dtype, op_name, op, qnn_params in [ + ("float32", "add", relay.add, {}), + ("uint8", "qnn.add", relay.qnn.op.add, _qnn_params), + ]: + for shape in [(1, 1), (2, 2, 2), (3, 3, 3, 3)]: + func = _get_model(shape, dtype, iter(inputs), op, qnn_params) + exp_codegen = _get_expected_codegen(shape, dtype, op_name, qnn_params) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_runtime_codegen_add() + test_runtime_add() From 0f0a904b5566ea05a21b68699d3ec6b3e06bbc3e Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Fri, 9 Oct 2020 21:05:01 +0100 Subject: [PATCH 2/3] Addressed lhutton1 comments --- docs/deploy/arm_compute_lib.rst | 5 ++++- src/runtime/contrib/arm_compute_lib/acl_runtime.cc | 13 ++----------- .../python/contrib/test_arm_compute_lib/test_add.py | 6 ++---- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git 
a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 1ff034a2cd8d..7e544e4b2af2 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -234,7 +234,10 @@ Operator support +----------------------+-------------------------------------------------------------------------+ | maximum | fp32 | +----------------------+-------------------------------------------------------------------------+ - +| add | fp32 | ++----------------------+-------------------------------------------------------------------------+ +| qnn.add | uint8 | ++----------------------+-------------------------------------------------------------------------+ .. note:: A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this as being a single fused operator from the view point of Arm Compute Library. '?' denotes an optional operator in diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index 26f18dc677d8..e5f2c2d47281 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -443,18 +443,9 @@ class ACLRuntime : public JSONRuntimeBase { throw std::runtime_error("Unsupported form of add op: " + op_name); } - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/S16/F16/F32 - * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/S16/F16/F32 - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/F32 - * @param[in] policy Policy to use to handle overflow. 
- * void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy); - * - * arm_compute::ConvertPolicy::SATURATE is used as add_QASYMM8_QASYMM8_QASYMM8 currently - * always saturates result - */ auto f = std::make_shared(); + + // SATURATE is used as add_QASYMM8_QASYMM8_QASYMM8 always saturates result f->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0], arm_compute::ConvertPolicy::SATURATE); layer->function = f; diff --git a/tests/python/contrib/test_arm_compute_lib/test_add.py b/tests/python/contrib/test_arm_compute_lib/test_add.py index c2b68702b6c2..d7abc5c414fb 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_add.py +++ b/tests/python/contrib/test_arm_compute_lib/test_add.py @@ -110,12 +110,10 @@ def test_runtime_add(): "op_params": op_params, } - # verify_saturation=False as the result of add_QASYMM8_QASYMM8_QASYMM8 - # is always saturated currently. verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=False) -def test_runtime_codegen_add(): +def test_codegen_add(): if skip_codegen_test(): return @@ -131,5 +129,5 @@ def test_runtime_codegen_add(): if __name__ == "__main__": - test_runtime_codegen_add() + test_codegen_add() test_runtime_add() From 05e0220623405631b6bc51c8d48f1b58a5f56645 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Sat, 10 Oct 2020 13:13:47 +0100 Subject: [PATCH 3/3] linter --- docs/deploy/arm_compute_lib.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 7e544e4b2af2..5dd00764bcbc 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -238,6 +238,7 @@ Operator support +----------------------+-------------------------------------------------------------------------+ | qnn.add | uint8 | +----------------------+-------------------------------------------------------------------------+ + .. 
note:: A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this as being a single fused operator from the view point of Arm Compute Library. '?' denotes an optional operator in