From 57338631208232c8e14f8a11e296cc2dc73d4ff1 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Tue, 15 Sep 2020 16:12:49 +0100 Subject: [PATCH 1/3] [BYOC][ACL] Support add operation Added support for an "add" operation implemented via ACL for fp32 and quantized uint8 data types --- .../tvm/relay/op/contrib/arm_compute_lib.py | 20 +++ .../contrib/arm_compute_lib/acl_runtime.cc | 51 ++++++- .../contrib/test_arm_compute_lib/test_add.py | 135 ++++++++++++++++++ 3 files changed, 200 insertions(+), 6 deletions(-) create mode 100644 tests/python/contrib/test_arm_compute_lib/test_add.py diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index 77fdbbd4006c..0578de799323 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -345,3 +345,23 @@ def maximum(attrs, args): type_a = args[0].checked_type type_b = args[0].checked_type return (type_a.dtype == "float32") and (type_b.dtype == "float32") + + +@tvm.ir.register_op_attr("add", "target.arm_compute_lib") +def add(attrs, args): + """Check if the external ACL codegen for add should be used.""" + for typ in [args[0].checked_type, args[1].checked_type]: + if typ.dtype != "float32": + return False + + return True + + +@tvm.ir.register_op_attr("qnn.add", "target.arm_compute_lib") +def qnn_add(attrs, args): + """Check if the external ACL codegen for add should be used.""" + for typ in [args[0].checked_type, args[1].checked_type]: + if typ.dtype != "uint8": + return False + + return True diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index 3fb4e0a3575f..26f18dc677d8 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -30,6 +30,7 @@ #ifdef TVM_GRAPH_RUNTIME_ARM_COMPUTE_LIB #include +#include #include #include #include @@ -142,6 +143,8 @@ class ACLRuntime : public 
JSONRuntimeBase { CreateReshapeLayer(&layer_, node); } else if ("maximum" == op_name) { CreateMaximumLayer(&layer_, node); + } else if ("add" == op_name || "qnn.add" == op_name) { + CreateAddLayer(&layer_, node); } else { LOG(FATAL) << "Unsupported op: " << op_name; } @@ -417,6 +420,45 @@ class ACLRuntime : public JSONRuntimeBase { function->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0]); layer->function = function; } + /*! + * \brief Creates an add/qnn.add layer + * + * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function. + * \param node The JSON representation of the operator. + */ + void CreateAddLayer(CachedLayer* layer, const JSONGraphNode& node) { + auto op_name = node.GetOpName(); + if ("add" == op_name) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1])); + layer->outputs.push_back(MakeACLTensorFromJSONNode(node)); + } else if ("qnn.add" == op_name) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[0], &node.GetInputs()[2], + &node.GetInputs()[3])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(node.GetInputs()[1], &node.GetInputs()[4], + &node.GetInputs()[5])); + layer->outputs.push_back( + MakeACLTensorFromJSONNode(node, &node.GetInputs()[6], &node.GetInputs()[7])); + } else { + throw std::runtime_error("Unsupported form of add op: " + op_name); + } + + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/S16/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/S16/F16/F32 + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/F32 + * @param[in] policy Policy to use to handle overflow. 
+ * void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy); + * + * arm_compute::ConvertPolicy::SATURATE is used as add_QASYMM8_QASYMM8_QASYMM8 currently + * always saturates result + */ + auto f = std::make_shared(); + f->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0], + arm_compute::ConvertPolicy::SATURATE); + layer->function = f; + } /*! \brief Allow ACL functions to request auxiliary memory from TVM. */ ACLAllocator allocator_; @@ -437,7 +479,6 @@ class ACLRuntime : public JSONRuntimeBase { } #endif }; - runtime::Module ACLRuntimeCreate(const String& symbol_name, const String& graph_json, const Array& const_names) { auto n = make_object(symbol_name, graph_json, const_names); @@ -445,10 +486,8 @@ runtime::Module ACLRuntimeCreate(const String& symbol_name, const String& graph_ } TVM_REGISTER_GLOBAL("runtime.arm_compute_lib_runtime_create").set_body_typed(ACLRuntimeCreate); - TVM_REGISTER_GLOBAL("runtime.module.loadbinary_arm_compute_lib") .set_body_typed(JSONRuntimeBase::LoadFromBinary); - -} // namespace contrib -} // namespace runtime -} // namespace tvm +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/tests/python/contrib/test_arm_compute_lib/test_add.py b/tests/python/contrib/test_arm_compute_lib/test_add.py new file mode 100644 index 000000000000..c2b68702b6c2 --- /dev/null +++ b/tests/python/contrib/test_arm_compute_lib/test_add.py @@ -0,0 +1,135 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Arm Compute Library integration add tests."""
+
+import numpy as np
+
+import tvm
+import tvm.testing
+from tvm import relay
+
+from test_arm_compute_lib.infrastructure import (
+    skip_runtime_test,
+    skip_codegen_test,
+    build_and_run,
+    verify,
+    verify_codegen,
+)
+from test_arm_compute_lib.infrastructure import Device
+
+_qnn_params = {
+    "lhs_scale": relay.const(0.0156863, "float32"),
+    "lhs_zero_point": relay.const(127, "int32"),
+    "rhs_scale": relay.const(0.0117647, "float32"),
+    "rhs_zero_point": relay.const(85, "int32"),
+    "output_scale": relay.const(0.0235294, "float32"),
+    "output_zero_point": relay.const(128, "int32"),
+}
+
+
+def _get_model(shape, dtype, var_names, op, op_params):
+    a = relay.var(next(var_names), shape=shape, dtype=dtype)
+    b = relay.var(next(var_names), shape=shape, dtype=dtype)
+    return op(a, b, **op_params)
+
+
+def _get_expected_codegen(shape, dtype, op_name, qnn_params):
+    input_a = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
+    input_b = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}}
+    input_qnn = [
+        {
+            "op": "const",
+            "name": "",
+            "attrs": {
+                "shape": [[list(qnn_params[_].data.shape)]],
+                "dtype": [[qnn_params[_].data.dtype]],
+            },
+        }
+        for _ in qnn_params
+    ]
+    inputs = [input_a, input_b, *input_qnn]
+    node = {
+        "op": "kernel",
+        "name": op_name,
+        "inputs": [[_, 0, 0] for _ in range(len(inputs))],
+        "attrs": {
+            "num_inputs": str(len(inputs)),
+            "num_outputs": "1",
+            "shape": [[list(shape)]],
+            "dtype": [[dtype]],
+        },
+    }
+
+    
return [*inputs, node] + + +def test_runtime_add(): + Device.load("test_config.json") + + if skip_runtime_test(): + return + + device = Device() + np.random.seed(0) + + for dtype, low, high, atol, rtol, op, op_params in [ + ("float32", -127, 128, 1e-7, 1e-7, relay.add, {}), + ("uint8", 0, 255, 0.0, 1.0, relay.qnn.op.add, _qnn_params), + ]: + shape = (2, 2) + for inputs in [ + { + "a": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)), + "b": tvm.nd.array(np.random.uniform(low, high, shape).astype(dtype)), + } + ]: + outputs = [] + func = _get_model(shape, dtype, iter(inputs), op, op_params) + for acl in [True, False]: + outputs.append(build_and_run(func, inputs, 1, None, device, enable_acl=acl)[0]) + + config = { + "shape": shape, + "dtype": dtype, + "inputs": inputs, + "operation": op, + "op_params": op_params, + } + + # verify_saturation=False as the result of add_QASYMM8_QASYMM8_QASYMM8 + # is always saturated currently. + verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=False) + + +def test_runtime_codegen_add(): + if skip_codegen_test(): + return + + inputs = {"a", "b"} + for dtype, op_name, op, qnn_params in [ + ("float32", "add", relay.add, {}), + ("uint8", "qnn.add", relay.qnn.op.add, _qnn_params), + ]: + for shape in [(1, 1), (2, 2, 2), (3, 3, 3, 3)]: + func = _get_model(shape, dtype, iter(inputs), op, qnn_params) + exp_codegen = _get_expected_codegen(shape, dtype, op_name, qnn_params) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_runtime_codegen_add() + test_runtime_add() From 0f0a904b5566ea05a21b68699d3ec6b3e06bbc3e Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Fri, 9 Oct 2020 21:05:01 +0100 Subject: [PATCH 2/3] Addressed lhutton1 comments --- docs/deploy/arm_compute_lib.rst | 5 ++++- src/runtime/contrib/arm_compute_lib/acl_runtime.cc | 13 ++----------- .../python/contrib/test_arm_compute_lib/test_add.py | 6 ++---- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git 
a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 1ff034a2cd8d..7e544e4b2af2 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -234,7 +234,10 @@ Operator support +----------------------+-------------------------------------------------------------------------+ | maximum | fp32 | +----------------------+-------------------------------------------------------------------------+ - +| add | fp32 | ++----------------------+-------------------------------------------------------------------------+ +| qnn.add | uint8 | ++----------------------+-------------------------------------------------------------------------+ .. note:: A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this as being a single fused operator from the view point of Arm Compute Library. '?' denotes an optional operator in diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index 26f18dc677d8..e5f2c2d47281 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -443,18 +443,9 @@ class ACLRuntime : public JSONRuntimeBase { throw std::runtime_error("Unsupported form of add op: " + op_name); } - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/S16/F16/F32 - * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/S16/F16/F32 - * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/F16/F32 - * @param[in] policy Policy to use to handle overflow. 
- * void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy); - * - * arm_compute::ConvertPolicy::SATURATE is used as add_QASYMM8_QASYMM8_QASYMM8 currently - * always saturates result - */ auto f = std::make_shared(); + + // SATURATE is used as add_QASYMM8_QASYMM8_QASYMM8 always saturates result f->configure(&layer->inputs[0], &layer->inputs[1], &layer->outputs[0], arm_compute::ConvertPolicy::SATURATE); layer->function = f; diff --git a/tests/python/contrib/test_arm_compute_lib/test_add.py b/tests/python/contrib/test_arm_compute_lib/test_add.py index c2b68702b6c2..d7abc5c414fb 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_add.py +++ b/tests/python/contrib/test_arm_compute_lib/test_add.py @@ -110,12 +110,10 @@ def test_runtime_add(): "op_params": op_params, } - # verify_saturation=False as the result of add_QASYMM8_QASYMM8_QASYMM8 - # is always saturated currently. verify(outputs, atol=atol, rtol=rtol, config=config, verify_saturation=False) -def test_runtime_codegen_add(): +def test_codegen_add(): if skip_codegen_test(): return @@ -131,5 +129,5 @@ def test_runtime_codegen_add(): if __name__ == "__main__": - test_runtime_codegen_add() + test_codegen_add() test_runtime_add() From 05e0220623405631b6bc51c8d48f1b58a5f56645 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Sat, 10 Oct 2020 13:13:47 +0100 Subject: [PATCH 3/3] linter --- docs/deploy/arm_compute_lib.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index 7e544e4b2af2..5dd00764bcbc 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -238,6 +238,7 @@ Operator support +----------------------+-------------------------------------------------------------------------+ | qnn.add | uint8 | +----------------------+-------------------------------------------------------------------------+ + .. 
note:: A composite operator is a series of operators that map to a single Arm Compute Library operator. You can view this as being a single fused operator from the view point of Arm Compute Library. '?' denotes an optional operator in