From 5349732cd7e3416a86b567eb7946951bc98ae86f Mon Sep 17 00:00:00 2001 From: Manupa Karunaratne Date: Thu, 5 Aug 2021 11:28:52 +0100 Subject: [PATCH] Arm(R) Ethos(TM)-U NPU codegen integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit integrates the codegen for Arm® Ethos™-U. * Adding Conv2D tests and a mobilenet_v1 conv2d offload test. Co-authored-by: Grant Watson Co-authored-by: Leandro Nunes Co-authored-by: Christopher Sidebottom Change-Id: Iff3de833842be05ce3d3525efd8f9a301c8fbd27 --- .../relay/backend/contrib/ethosu/__init__.py | 1 + .../relay/backend/contrib/ethosu/codegen.py | 83 +++++ .../relay/backend/contrib/ethosu/legalize.py | 6 + .../tvm/relay/backend/contrib/ethosu/util.py | 12 + src/relay/backend/aot_executor_codegen.cc | 1 - .../backend/contrib/ethosu/source_module.cc | 320 ++++++++++++++++++ tests/python/contrib/test_ethosu/infra.py | 231 +++++++++++++ .../reference_system/arm-none-eabi-gcc.cmake | 79 +++++ .../test_ethosu/reference_system/ethosu_55.h | 27 ++ .../test_ethosu/reference_system/ethosu_mod.h | 59 ++++ .../test_ethosu/reference_system/hard_fault.h | 53 +++ .../contrib/test_ethosu/test_codegen.py | 174 ++++++++++ .../contrib/test_ethosu/test_networks.py | 65 ++++ tests/python/relay/aot/aot_test_utils.py | 200 ++++++++--- tests/python/relay/aot/corstone300.ld | 8 + tests/python/relay/aot/corstone300.mk | 23 +- tests/python/relay/aot/test_crt_aot.py | 11 +- 17 files changed, 1305 insertions(+), 48 deletions(-) create mode 100644 python/tvm/relay/backend/contrib/ethosu/codegen.py create mode 100644 src/relay/backend/contrib/ethosu/source_module.cc create mode 100644 tests/python/contrib/test_ethosu/reference_system/arm-none-eabi-gcc.cmake create mode 100644 tests/python/contrib/test_ethosu/reference_system/ethosu_55.h create mode 100644 tests/python/contrib/test_ethosu/reference_system/ethosu_mod.h create mode 100644 tests/python/contrib/test_ethosu/reference_system/hard_fault.h 
create mode 100644 tests/python/contrib/test_ethosu/test_codegen.py create mode 100644 tests/python/contrib/test_ethosu/test_networks.py diff --git a/python/tvm/relay/backend/contrib/ethosu/__init__.py b/python/tvm/relay/backend/contrib/ethosu/__init__.py index 2b424ebb5dec0..5fd1a0c19dc90 100644 --- a/python/tvm/relay/backend/contrib/ethosu/__init__.py +++ b/python/tvm/relay/backend/contrib/ethosu/__init__.py @@ -19,6 +19,7 @@ from . import legalize from . import preprocess from . import errors +from . import codegen from . import vela_api from . import tir_to_cs_translator from .util import partition_for_ethosu diff --git a/python/tvm/relay/backend/contrib/ethosu/codegen.py b/python/tvm/relay/backend/contrib/ethosu/codegen.py new file mode 100644 index 0000000000000..e821ea8bf0c4b --- /dev/null +++ b/python/tvm/relay/backend/contrib/ethosu/codegen.py @@ -0,0 +1,83 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Codegen for Arm(R) Ethos(TM)-U""" +import tvm +from tvm import relay +from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_tir +from tvm.relay.backend.contrib.ethosu.tir.scheduler import copy_constants +from tvm.relay.backend.contrib.ethosu.legalize import LegalizeEthosU +from tvm.relay.backend.contrib.ethosu import tir_to_cs_translator +from tvm.relay.backend.contrib.ethosu import util + + +@tvm._ffi.register_func("relay.ext.ethosu.constant_updater") +def constant_updater(expr, symbol): # pylint: disable=unused-argument + """ + We don't want the build process to extract constants to be loaded in + the runtime as we are embedding them inside the C runtime.Module. + """ + return dict() + + +@tvm._ffi.register_func("relay.ext.ethosu") +def ethosu_compiler(ref): + """Main function to compile a given relay function of + NPU compatible operators to generated command stream. + Such generated command stream would be loaded to the runtime + module that interfaces with NPU driver. + """ + assert isinstance(ref, tvm.ir.function.BaseFunc) + func_name = ref.attrs["global_symbol"] + # There should only be a single input + assert len(ref.params) == 1 + input_size = util.calculate_size_bytes(ref.params[0]) + output_size = util.calculate_size_bytes(ref.body) + cmms, encoded_constants, scratch_size = _compile(ref) + ethosu_runtime = tvm._ffi.get_global_func("runtime.module.ethosu.create") + return ethosu_runtime(func_name, cmms, encoded_constants, scratch_size, input_size, output_size) + + +def _compile(ext_func): + """ + This is the main wrapper that accepts an external + relay function and runs all the passes to lower it down + to command stream + Parameters + ---------- + ext_func : tvm.relay.function.Function + The partitioned relay function + Returns + ------- + cs : str + A hex string of the bytes of command stream + encoded_constants : str + A hex string of the bytes that includes concat'd + encoded weights, encoded biases and scales. 
+ scratch_size : int + The size of the scratch buffer needed. + """ + mod = tvm.IRModule() + mod["main"] = ext_func + mod = LegalizeEthosU()(mod) + mod = relay.transform.InferType()(mod) + # We are currently using copy_constants scheduler. In the long run, + # this should be a single intelligent and a composite scheduler + # that can perform scheduling based on user inputs such as + # scratch memory size. + tir_mod, params = lower_to_tir(mod["main"], copy_constants()) + cmms, encoded_constants, scratch_size = tir_to_cs_translator.translate(tir_mod, params) + return cmms, encoded_constants, scratch_size diff --git a/python/tvm/relay/backend/contrib/ethosu/legalize.py b/python/tvm/relay/backend/contrib/ethosu/legalize.py index 82b7f1e68ceee..fd58da803623c 100644 --- a/python/tvm/relay/backend/contrib/ethosu/legalize.py +++ b/python/tvm/relay/backend/contrib/ethosu/legalize.py @@ -221,3 +221,9 @@ def transform_module( mod = LegalizeSplit()(mod) mod = LegalizeEthosUConv2D()(mod) return mod + + def __call__(self, *args, **kwargs): + # pylint is unable to figure out the decorated + # class is callable, thus adding this to + # suppress the warning. 
+ pass diff --git a/python/tvm/relay/backend/contrib/ethosu/util.py b/python/tvm/relay/backend/contrib/ethosu/util.py index 0919d3fe7a5f7..b5c2179b893b2 100644 --- a/python/tvm/relay/backend/contrib/ethosu/util.py +++ b/python/tvm/relay/backend/contrib/ethosu/util.py @@ -197,3 +197,15 @@ def get_dim_value(layout: str, dim: int): if dim_char == dim: return idx return None + + +def calculate_size_bytes(expr): + """This is a helper function to calculate the number + of bytes required to hold the tensor/relay.expr""" + try: + type_info = np.iinfo(expr.checked_type.dtype) + except ValueError: + type_info = np.finfo(expr.checked_type.dtype) + element_size = type_info.bits // 8 + elements = np.prod(list(expr.checked_type.shape)) + return element_size * elements diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 0c094cb1fa2ca..2738a4a75db5d 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -650,7 +650,6 @@ class AOTExecutorCodegen : public MixedModeVisitor { // Apply storage rewrite pass to the runner function to do memory planning auto storage_rewrite = tir::transform::StorageRewrite(); mod_run = storage_rewrite(mod_run); - // The workspace for main function should be calculated after performing storage_rewrite for // the top level TIR function. auto workspace_byte_alignment = diff --git a/src/relay/backend/contrib/ethosu/source_module.cc b/src/relay/backend/contrib/ethosu/source_module.cc new file mode 100644 index 0000000000000..61a880e17ffba --- /dev/null +++ b/src/relay/backend/contrib/ethosu/source_module.cc @@ -0,0 +1,320 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../../../../runtime/file_utils.h" + +namespace tvm { +namespace runtime { + +class EthosUModuleNode : public ModuleNode { + public: + /*! + * \brief The ethos runtime module. + * + * \param cmms An array of external symbol 1, serialized command stream 1 + * external symbol 2, serialized command stream 2, .... + * TODO : if and when FFI supports Maps with non-objects OR compound arrays + * switch to that. + */ + explicit EthosUModuleNode(const String& func_name_, const String& cmms_hex_, + const String& weights_bias_hex_, const Integer& scratch_size_, + const Integer& input_size_, const Integer& output_size_) { + func_names_.push_back(func_name_); + cmms_hex = std::move(cmms_hex_); + weights_bias_hex = std::move(weights_bias_hex_); + scratch_size = scratch_size_->value; + input_size = input_size_->value; + output_size = output_size_->value; + c_source = GenerateSource(); + } + + /*! + * \brief Save the module to file. + * + * \param file_name The file to be saved to. + * \param format The format of the file. 
+ */ + void SaveToFile(const std::string& file_name, const std::string& format) final { + std::string fmt = GetFileFormat(file_name, format); + LOG(INFO) << "format=" << fmt << ";;\n"; + ICHECK_EQ(fmt, "c") << "Can only save to format=" + << "c"; + std::ofstream out(file_name); + out << c_source; + out.close(); + } + + std::string GetSource(const std::string& format) final { return c_source; } + + std::string GetCS() { return cmms_hex; } + + /*! + * \brief Get a PackedFunc from the module. + * + * \param name The name of the function. + * \param sptr_to_self The ObjectPtr that points to this module node. + * + * \return The function pointer when it is found, otherwise, PackedFunc(nullptr). + */ + PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) final { + if (name == "get_func_names") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->func_names_; }); + } + return PackedFunc(); + } + + const char* type_key() const override { return "c"; } + + static Module Create(String func_name, String cmms_hex, String weights_bias_hex, + Integer scratch_size, Integer input_size, Integer output_size) { + auto n = make_object(func_name, cmms_hex, weights_bias_hex, scratch_size, + input_size, output_size); + return Module(n); + } + + private: + String c_source; + Array func_names_; + String cmms_hex; + String weights_bias_hex; + size_t scratch_size; + size_t input_size; + size_t output_size; + int indent_{0}; + + /*! + * \brief Convert the raw string of hex values into a hex string + * + * \param raw the raw string of hex values + * + * \return string formatted as a hex string + */ + std::string GetHexString(const std::string& raw) { + std::stringstream ss; + for (size_t i = 0; i < raw.size() / 2; ++i) { + ss << "\\x" << raw.substr(i * 2, 2); + } + return ss.str(); + } + + /*! 
+ * \brief Emit code that updates the base_addrs array with the base address of the given array + * + * \param index array index for base_addrs and base_addrs_size + * \param name of the array containing relevant data + * + * \return string of code that updates the base_addrs array with the base address of the given + * array + */ + std::string SetBaseAddress(int index, std::string name) { + std::stringstream ss; + ss << " base_addrs[" << index << "] = (uintptr_t)(" << name << ");\n"; + ss << " base_addrs_size[" << index << "] = " << name << "_size;\n"; + return ss.str(); + } + + /*! + * \brief Enter a new scope. + */ + void EnterScope() { indent_ += 2; } + + /*! + * \brief Exit a scope. + */ + void ExitScope() { + ICHECK_GE(indent_, 2U) << "Wrong ident found."; + indent_ -= 2; + } + + /*! \brief Print indents using spaces. */ + void PrintIndents(std::stringstream& ss) { + for (int i = 0; i < indent_; i++) { + ss << ' '; + } + } + + /*! + * \brief Creates a runtime function header + */ + void PrintRuntimeFunctionHeader(std::stringstream& ss, std::string func_name) { + ss << "TVM_DLL int32_t "; + ss << func_name << "(void* input, void* output) {\n"; + } + + /*! + * \brief Creates a cplusplus guard prefix for extern "C" printing + */ + void PrintExternCPrefix(std::stringstream& ss) { + PrintIndents(ss); + ss << "#ifdef __cplusplus\n"; + ss << "extern \"C\" {\n"; + ss << "#endif\n"; + } + + /*! + * \brief Creates a cplusplus guard postfix for extern "C" printing + */ + void PrintExternCPostfix(std::stringstream& ss) { + PrintIndents(ss); + ss << "#ifdef __cplusplus\n"; + ss << "}\n"; + ss << "#endif\n"; + } + + /*! 
+ * \brief Emit code that offloads a subgraph to the NPU + * + * \return string of code that offloads a subgraph to the NPU + */ + std::string GenerateSource() { + std::string func_no_dashes = func_names_[0]; + std::replace(func_no_dashes.begin(), func_no_dashes.end(), '-', '_'); + std::stringstream ss; + + ss << "#include \n"; + ss << "#include \n"; + ss << "#include \n"; + ss << "#include \n"; + ss << "#include \n"; + ss << "\n"; + size_t weights_size = (weights_bias_hex.size() / 2); + ss << "static const size_t weights_size = " << std::to_string(weights_size) << ";\n"; + ss << "static const size_t scratch_size = " << std::to_string(scratch_size) << ";\n"; + ss << "// Update linker script to place ethosu_scratch in memory that can be accessed by the " + "NPU\n"; + if (weights_size > 0) { + ss << "__attribute__((section(\"ethosu_scratch\"), aligned(16))) static int8_t weights[" + << weights_size << "] = \""; + ss << GetHexString(weights_bias_hex); + ss << "\";\n"; + } else { + ss << "static int8_t* weights = NULL;\n"; + } + ss << "__attribute__((section(\"ethosu_scratch\"), aligned(16))) static int8_t cms_data_data[" + << cmms_hex.size() / 2 << "] = \""; + ss << GetHexString(cmms_hex); + ss << "\";\n"; + ss << "static const size_t cms_data_size = sizeof(cms_data_data);\n"; + ss << "\n"; + + PrintExternCPrefix(ss); + ss << "static int32_t " << func_no_dashes + "_(int8_t* in0, " + << "size_t in0_size, int8_t* out0, size_t out0_size) {\n"; + ss << " int num_tensors = 5;\n"; + ss << " void* cms_data = (void*)(cms_data_data);\n"; + ss << " int64_t device_type = kDLCPU;\n"; + ss << " int64_t device_id = 0;\n"; + if (scratch_size > 0) { + ss << " int8_t* scratch = (int8_t*) TVMBackendAllocWorkspace(device_type, device_id, " + "(uint64_t)scratch_size, 0, 16);\n"; + } else { + ss << " int8_t* scratch = NULL;\n"; + } + ss << " size_t base_addrs_size[num_tensors];\n"; + ss << " uint64_t base_addrs[num_tensors];\n"; + ss << "\n"; + ss << SetBaseAddress(0, "weights"); + ss << 
SetBaseAddress(1, "scratch"); + ss << SetBaseAddress(2, "scratch"); + ss << SetBaseAddress(3, "in0"); + ss << SetBaseAddress(4, "out0"); + ss << "\n"; + ss << " struct ethosu_driver *drv = ethosu_reserve_driver();\n"; + ss << " int32_t result = ethosu_invoke(drv, cms_data, cms_data_size, base_addrs, " + "base_addrs_size, " + "num_tensors);\n"; + ss << " ethosu_release_driver(drv);\n"; + if (scratch_size > 0) { + ss << " TVMBackendFreeWorkspace(device_type, device_id, scratch);\n"; + } + ss << " if (result != 0) {\n"; + ss << " return -1;\n"; + ss << " } else {\n"; + ss << " return 0;\n"; + ss << " }\n"; + ss << "}\n"; + ss << "\n"; + PrintExternCPostfix(ss); + ss << "\n"; + PrintExternCPrefix(ss); + ss << "// Wrapper function is provided to allow for easier debugging\n"; + ss << "inline static int32_t " + func_no_dashes + "_wrapper_(void* input, void* output) {\n"; + ss << " size_t input_data_size = " << input_size << ";\n"; + ss << " size_t output_data_size = " << output_size << ";\n"; + ss << " return " + func_no_dashes + + "_((int8_t*)input, input_data_size, (int8_t*)output, output_data_size);\n"; + ss << "}\n"; + PrintExternCPostfix(ss); + ss << "\n"; + PrintExternCPrefix(ss); + PrintRuntimeFunctionHeader(ss, func_names_[0]); + EnterScope(); + PrintIndents(ss); + ss << "return " << func_no_dashes << "_wrapper_(input, output);\n"; + ExitScope(); + ss << "}\n"; + PrintExternCPostfix(ss); + + return ss.str(); + } +}; + +class EthosUModule : public Module { + public: + EthosUModule() {} + explicit EthosUModule(ObjectPtr n) : Module(n) {} + /*! \return internal container */ + inline EthosUModuleNode* operator->(); + /*! 
\return internal container */ + inline const EthosUModuleNode* operator->() const; +}; + +inline EthosUModuleNode* EthosUModule::operator->() { + return static_cast(get_mutable()); +} + +TVM_REGISTER_GLOBAL("runtime.module.ethosu.create").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = EthosUModuleNode::Create(args[0], args[1], args[2], args[3], args[4], args[5]); +}); + +TVM_REGISTER_GLOBAL("runtime.module.ethosu.getcs").set_body_typed([](EthosUModule mod) { + return mod->GetCS(); +}); + +} // namespace runtime +} // namespace tvm diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py index fc795c066cb6b..a01907fd2330b 100644 --- a/tests/python/contrib/test_ethosu/infra.py +++ b/tests/python/contrib/test_ethosu/infra.py @@ -24,15 +24,31 @@ the command stream and perform an equivalency check for single operator test cases. """ +from typing import List +import os +import struct import numpy from enum import IntEnum +from ethosu.vela.register_command_stream_generator import CmdMode +from ethosu.vela.register_command_stream_generator import cmd0 +from ethosu.vela.register_command_stream_generator import cmd1 + import tvm from tvm import relay import tvm.relay.backend.contrib.ethosu.op as ethosu_ops from tvm.topi.nn.utils import get_pad_tuple +from tests.python.relay.aot.aot_test_utils import ( + AOTCompiledTestModel, + AOTDataLinkage, + AOTTestModel, + AOTTestRunner, + compile_models, + execute_models, +) + class AttachType(IntEnum): kGroupRoot = 1 @@ -42,6 +58,221 @@ class AttachType(IntEnum): kScanUpdate = 5 +class VelaArtifacts: + def __init__(self): + self.cs = dict() + self.flash = dict() + self.sram = dict() + self.npu_ops = set() + + +def parse_relay_tflite_model(tflite_model, input_tensor, input_shape, input_dtype): + mod_, params_ = relay.frontend.from_tflite( + tflite_model, + shape_dict={input_tensor: input_shape}, + dtype_dict={input_tensor: input_dtype}, + ) + return mod_, params_ + + +def 
parse_tflite_model(model_file): + try: + import tflite + + return tflite.Model.GetRootAsModel(model_file, 0) + except AttributeError: + import tflite.Model + + return tflite.Model.Model.GetRootAsModel(model_file, 0) + + +def print_payload(payload): + cmds = deserialize_command_stream(payload) + for cmd_val in cmds: + cmd, val = parse_cmd(cmd_val) + s = str(cmd) + s = s.ljust(40) + s += str(val) + print(s) + + +def parse_cmd(binary_cmd): + code = binary_cmd[0] & 0x0000FFFF # lower 16 bits + param = binary_cmd[0] >> 16 # higher 16 bits + payload_mode = CmdMode(code & CmdMode.Mask) + if payload_mode == CmdMode.Payload32: + command = cmd1(code & CmdMode.CmdOpMask) + value = binary_cmd[1] + else: + command = cmd0(code & CmdMode.CmdOpMask) + value = param + return command, value + + +def check_cmms_equivalency(vela_cmd, vela_value, tvm_value, ignore_cmds=None): + if ignore_cmds is None: + ignore_cmds = [] + if vela_value != tvm_value and vela_cmd not in ignore_cmds: + raise RuntimeError( + "ValueMismatch :: vela={}, tvm={} for command:{}".format( + vela_value, tvm_value, vela_cmd + ) + ) + + +def verify_cmms(cmms_tvm_blob, cmms_vela_blob): + vela_cmm = deserialize_command_stream(cmms_vela_blob) + tvm_cmm = deserialize_command_stream(cmms_tvm_blob) + cmms_zip = zip(vela_cmm, tvm_cmm) + + first_ifm_found = False + last_ofm_found = False + + ignore_commands = ( + cmd1.NPU_SET_DMA0_SRC, + cmd1.NPU_SET_DMA0_DST, + cmd1.NPU_SET_WEIGHT_BASE, + cmd1.NPU_SET_OFM_BASE0, + cmd1.NPU_SET_IFM_BASE0, + cmd1.NPU_SET_SCALE_BASE, + ) + + ofm_region_params = [] + ofm_bases = [] + for vela_cmm, tvm_cmm in cmms_zip: + vela_cmd, vela_value = parse_cmd(vela_cmm) + tvm_cmd, tvm_value = parse_cmd(tvm_cmm) + + assert vela_cmd == tvm_cmd + + # The first IFM region could be different, but it needs to be 1 and 3. 
+ if vela_cmd == cmd0.NPU_SET_IFM_REGION and not first_ifm_found: + if vela_value == 1 and tvm_value == 3: + first_ifm_found = True + continue + + if vela_cmd == cmd1.NPU_SET_IFM_BASE0 and not first_ifm_found: + if tvm_value != 0: + raise RuntimeError("ValueError :: tvm primary ifm base should be zero") + continue + + # OFM regions should be cached to be checked later + if vela_cmd == cmd0.NPU_SET_OFM_REGION: + ofm_region_params.append((vela_value, tvm_value)) + continue + + # OFM bases should be cached to be checked later + if vela_cmd == cmd1.NPU_SET_OFM_BASE0: + ofm_bases.append((vela_value, tvm_value)) + continue + + check_cmms_equivalency(vela_cmd, vela_value, tvm_value, ignore_commands) + + # The last OFM region could be different but it should be 1 and 4. + last_vela_ofm_region, last_tvm_ofm_region = ofm_region_params.pop(-1) + if not (last_vela_ofm_region == 1 and last_tvm_ofm_region == 4): + raise RuntimeError( + "ValueMismatch :: vela={}, tvm={} for last ofm region it should be 1 and 4 respectively".format( + last_vela_ofm_region, last_tvm_ofm_region + ) + ) + + # The rest of the OFM regions should be the same. 
+ for vela_value, tvm_value in ofm_region_params: + check_cmms_equivalency(vela_cmd, vela_value, tvm_value, ignore_commands) + + # The last OFM base should be zero for tvm + _, last_tvm_ofm_base = ofm_bases.pop(-1) + if not last_tvm_ofm_base == 0: + raise RuntimeError("ValueError :: tvm primary ofm base should be zero") + + +def deserialize_command_stream(blob): + assert isinstance(blob, bytes) + payload_bytes = struct.unpack("<{0}I".format(len(blob) // 4), blob) + cmms = [] + # remove_header + payload_bytes = payload_bytes[8:] + idx = 0 + while idx < len(payload_bytes): + cmd = [] + code = payload_bytes[idx] + idx += 1 + cmd.append(code) + payload_mode = CmdMode(code & CmdMode.Mask) + if payload_mode == CmdMode.Payload32: + value = payload_bytes[idx] + idx += 1 + cmd.append(value) + cmms.append(cmd) + return cmms + + +def _create_test_runner(accel): + file_dir = os.path.dirname(os.path.abspath(__file__)) + test_root = os.path.join(file_dir, "reference_system") + ethosu_macs = accel[accel.rfind("-") + 1 :] + return AOTTestRunner( + makefile="corstone300", + prologue=""" + uart_init(); + EthosuInit(); + """, + includes=["uart.h", "ethosu_55.h", "ethosu_mod.h", "hard_fault.h"], + parameters={"ETHOSU_TEST_ROOT": test_root, "NPU_VARIANT": ethosu_macs}, + pass_config={ + "relay.ext.ethosu.options": { + "accelerator_config": accel, + } + }, + ) + + +def build_source(module, inputs, outputs, accel="ethos-u55-256"): + interface_api = "c" + use_unpacked_api = True + test_runner = _create_test_runner(accel) + + return compile_models( + AOTTestModel( + module=module, + inputs=inputs, + outputs=outputs, + output_tolerance=10, + extra_memory_in_bytes=16 * 1024 * 1024, + ), + test_runner, + interface_api, + use_unpacked_api, + workspace_byte_alignment=16, + ) + + +def verify_source( + models: List[AOTCompiledTestModel], + accel="ethos-u55-256", +): + """ + This method verifies the generated source from an NPU module by building it and running on an FVP. 
+ """ + interface_api = "c" + test_runner = _create_test_runner(accel) + execute_models( + models, + test_runner, + interface_api, + workspace_byte_alignment=16, + data_linkage=AOTDataLinkage(section="ethosu_scratch", alignment=16), + ) + + +def flatten_numpy_data(data): + """Flatten the numpy tensor to be single dimensional""" + total_elements = data.size + reshaped_data = numpy.reshape(data, [total_elements]) + return reshaped_data + + def generate_weights_data(shape, dtype): size = 1 for dim in shape: diff --git a/tests/python/contrib/test_ethosu/reference_system/arm-none-eabi-gcc.cmake b/tests/python/contrib/test_ethosu/reference_system/arm-none-eabi-gcc.cmake new file mode 100644 index 0000000000000..6aeb0b7cc7c15 --- /dev/null +++ b/tests/python/contrib/test_ethosu/reference_system/arm-none-eabi-gcc.cmake @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if (__TOOLCHAIN_LOADED) + return() +endif() +set(__TOOLCHAIN_LOADED TRUE) + +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_C_COMPILER "arm-none-eabi-gcc") +set(CMAKE_CXX_COMPILER "arm-none-eabi-g++") +set(CMAKE_SYSTEM_PROCESSOR "cortex-m55" CACHE STRING "Select Cortex-M architecture. 
(cortex-m0, cortex-m3, cortex-m33, cortex-m4, cortex-m55, cortex-m7, etc)") + +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + +SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +set(CMAKE_C_STANDARD 99) +set(CMAKE_CXX_STANDARD 14) + +# The system processor could for example be set to cortex-m33+nodsp+nofp. +set(__CPU_COMPILE_TARGET ${CMAKE_SYSTEM_PROCESSOR}) +string(REPLACE "+" ";" __CPU_FEATURES ${__CPU_COMPILE_TARGET}) +list(POP_FRONT __CPU_FEATURES CMAKE_SYSTEM_PROCESSOR) + +string(FIND ${__CPU_COMPILE_TARGET} "+" __OFFSET) +if(__OFFSET GREATER_EQUAL 0) + string(SUBSTRING ${__CPU_COMPILE_TARGET} ${__OFFSET} -1 CPU_FEATURES) +endif() + +# Add -mcpu to the compile options to override the -mcpu the CMake toolchain adds +add_compile_options(-mcpu=${__CPU_COMPILE_TARGET}) + +# Set floating point unit +if("${__CPU_COMPILE_TARGET}" MATCHES "\\+fp") + set(FLOAT hard) +elseif("${__CPU_COMPILE_TARGET}" MATCHES "\\+nofp") + set(FLOAT soft) +elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "cortex-m33" OR + "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "cortex-m55") + set(FLOAT hard) +else() + set(FLOAT soft) +endif() + +add_compile_options(-mfloat-abi=${FLOAT}) +add_link_options(-mfloat-abi=${FLOAT}) + +# Link target +add_link_options(-mcpu=${__CPU_COMPILE_TARGET}) +add_link_options(-Xlinker -Map=output.map) + +# +# Compile options +# +set(cxx_flags "-fno-unwind-tables;-fno-rtti;-fno-exceptions") + +add_compile_options("-Wall;-Wextra;-Wsign-compare;-Wunused;-Wswitch-default;\ +-Wdouble-promotion;-Wredundant-decls;-Wshadow;-Wnull-dereference;\ +-Wno-format-extra-args;-Wno-unused-function;-Wno-unused-label;\ +-Wno-missing-field-initializers;-Wno-return-type;-Wno-format;-Wno-int-conversion" + "$<$:${cxx_flags}>" + ) diff --git a/tests/python/contrib/test_ethosu/reference_system/ethosu_55.h b/tests/python/contrib/test_ethosu/reference_system/ethosu_55.h new file mode 100644 index 
0000000000000..41ce284956e2e --- /dev/null +++ b/tests/python/contrib/test_ethosu/reference_system/ethosu_55.h @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_CONTRIB_ETHOS_U_ETHOSU_55_H_ +#define TVM_RUNTIME_CONTRIB_ETHOS_U_ETHOSU_55_H_ + +/* Define Arm(R) Ethos(TM)-U55 specific IRQs & base address */ +#define ETHOSU_NPU_FAIL (1 << 4) +#define ETHOSU_IRQ ((IRQn_Type)56) +#define ETHOSU_BASE_ADDRESS ((void*)0x48102000) + +#endif // TVM_RUNTIME_CONTRIB_ETHOS_U_ETHOSU_55_H_ diff --git a/tests/python/contrib/test_ethosu/reference_system/ethosu_mod.h b/tests/python/contrib/test_ethosu/reference_system/ethosu_mod.h new file mode 100644 index 0000000000000..aa5c1026bd6d8 --- /dev/null +++ b/tests/python/contrib/test_ethosu/reference_system/ethosu_mod.h @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_CONTRIB_ETHOS_U_ETHOSU_MOD_H_ +#define TVM_RUNTIME_CONTRIB_ETHOS_U_ETHOSU_MOD_H_ + +#include +// TODO(@grant-arm): Remove device specific information once RTOS support is available +#include +#include + +#include "ethosu_55.h" + +struct ethosu_driver* ethosu0_driver = &ethosu_drv; + +void ethosuIrqHandler0() { ethosu_irq_handler(ethosu0_driver); } + +// Initialize Arm(R) Ethos(TM)-U NPU driver +int EthosuInit() { + if (ethosu_init(ethosu0_driver, (void*)ETHOSU_BASE_ADDRESS, NULL, 0, 1, 1)) { + printf("Failed to initialize NPU.\n"); + return -1; + } + + // Display Arm(R) Ethos(TM)-U version information useful for debugging issues + struct ethosu_version version; + ethosu_get_version(ethosu0_driver, &version); + printf( + "version={major=%u, minor=%u, status=%u}, product={major=%u}, arch={major=%u, minor=%u, " + "patch=%u}\n", + version.id.version_major, version.id.version_minor, version.id.version_status, + version.id.product_major, version.id.arch_major_rev, version.id.arch_minor_rev, + version.id.arch_patch_rev); + printf("macs_per_cc=%u, cmd_stream_version=%u, shram_size=%u\n", version.cfg.macs_per_cc, + version.cfg.cmd_stream_version, version.cfg.shram_size); + + // Assumes SCB->VTOR points to RW memory + NVIC_SetVector(ETHOSU_IRQ, (uint32_t)&ethosuIrqHandler0); + NVIC_EnableIRQ(ETHOSU_IRQ); + + return 0; +} + +#endif // TVM_RUNTIME_CONTRIB_ETHOS_U_ETHOSU_MOD_H_ diff --git a/tests/python/contrib/test_ethosu/reference_system/hard_fault.h b/tests/python/contrib/test_ethosu/reference_system/hard_fault.h new file mode 100644 
index 0000000000000..9d349004848be --- /dev/null +++ b/tests/python/contrib/test_ethosu/reference_system/hard_fault.h @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_CONTRIB_ETHOS_U_HARD_FAULT_H_ +#define TVM_RUNTIME_CONTRIB_ETHOS_U_HARD_FAULT_H_ + +struct ExcContext { + uint32_t r0; + uint32_t r1; + uint32_t r2; + uint32_t r3; + uint32_t r12; + uint32_t lr; + uint32_t pc; + uint32_t xPsr; +}; +void HardFault_Handler() { + int irq; + struct ExcContext* e; + uint32_t sp; + asm volatile( + "mrs %0, ipsr \n" // Read IPSR (Exception number) + "sub %0, #16 \n" // Get it into IRQn_Type range + "tst lr, #4 \n" // Select the stack which was in use + "ite eq \n" + "mrseq %1, msp \n" + "mrsne %1, psp \n" + "mov %2, sp \n" + : "=r"(irq), "=r"(e), "=r"(sp)); + printf("Hard fault. 
irq=%d, pc=0x%08lu, lr=0x%08lu, xpsr=0x%08lu, sp=0x%08lu\n", irq, e->pc, + e->lr, e->xPsr, sp); + printf("%11s cfsr=0x%08lu bfar=0x%08lu\n", "", SCB->CFSR, SCB->BFAR); + printf("EXITTHESIM\n"); + while (1 == 1) + ; +} + +#endif // TVM_RUNTIME_CONTRIB_ETHOS_U_HARD_FAULT_H_ diff --git a/tests/python/contrib/test_ethosu/test_codegen.py b/tests/python/contrib/test_ethosu/test_codegen.py new file mode 100644 index 0000000000000..e88d6a37c291f --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_codegen.py @@ -0,0 +1,174 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-argument +import pytest + +pytest.importorskip("ethosu.vela") +import os +import numpy as np +import pathlib + +import tvm +import tvm.micro as micro +from tvm import relay +from tvm.relay.backend.contrib import ethosu +from tvm.relay.backend.contrib.ethosu import util +from tests.python.relay.aot.aot_test_utils import generate_ref_data + +import relay_ir_builder +import infra + +ACCEL_TYPES = ["ethos-u55-256", "ethos-u55-128", "ethos-u55-64", "ethos-u55-32"] + + +def infer_type_function_pass(func): + mod = tvm.IRModule() + mod["test"] = func + mod = relay.transform.InferType()(mod) + return mod["test"] + + +def get_shape_expr(in_expr, out_expr): + main_f = relay.Function([in_expr], out_expr) + main_f = infer_type_function_pass(main_f) + shape = [int(i) for i in main_f.body.checked_type.shape] + return shape + + +@pytest.mark.parametrize( + "accel_type", + ACCEL_TYPES, +) +def test_ethosu_conv2d(accel_type): + def create_graph_single(input_tensor_name, input_tensor_shape, input_tensor_dtype): + c1_params = relay_ir_builder.QnnConv2DParams(input_tensor_dtype) + c1_params.ifm.shape = input_tensor_shape + c1_params.kernel.shape = (3, 3, c1_params.ifm.shape[3], 32) + c1_params.kernel.sc = relay.const(np.random.rand(32) * 2, "float32") + c1_params.strides = (1, 1) + c1_params.pad = "VALID" + c1_params.update_output_qnn_params( + input_tensor_dtype, input_tensor_dtype, input_tensor_dtype + ) + input0 = relay.var(input_tensor_name, shape=c1_params.ifm.shape, dtype=c1_params.ifm.dtype) + c1, new_params = relay_ir_builder.create_qnn_conv2d(c1_params, input0) + c1_params.ofm.shape = get_shape_expr(input0, c1) + + f = relay.Function([input0], c1) + mod = tvm.IRModule() + mod["main"] = f + return mod, [c1_params] + + def create_graph_double(input_tensor_name, input_tensor_shape, input_tensor_dtype): + c1_params = relay_ir_builder.QnnConv2DParams(input_tensor_dtype) + c1_params.ifm.shape = input_tensor_shape + c1_params.kernel.shape = (7, 
7, c1_params.ifm.shape[3], 8) + c1_params.strides = (2, 2) + c1_params.pad = "VALID" + c1_params.update_output_qnn_params( + input_tensor_dtype, input_tensor_dtype, input_tensor_dtype + ) + input0 = relay.var(input_tensor_name, shape=c1_params.ifm.shape, dtype=c1_params.ifm.dtype) + c1, new_params = relay_ir_builder.create_qnn_conv2d(c1_params, input0) + c1_params.ofm.shape = get_shape_expr(input0, c1) + + c2_params = relay_ir_builder.QnnConv2DParams(input_tensor_dtype) + c2_params.ifm.shape = c1_params.ofm.shape + c2_params.kernel.shape = (5, 5, c2_params.ifm.shape[3], 16) + c2_params.strides = (1, 1) + c2_params.pad = "SAME" + c2_params.update_output_qnn_params() + c2, new_params = relay_ir_builder.create_qnn_conv2d(c2_params, c1) + c2_params.ofm.shape = get_shape_expr(input0, c2) + + f = relay.Function([input0], c2) + mod = tvm.IRModule() + mod["main"] = f + return mod, [c2_params, c1_params] + + def create_graph_activation(input_tensor_name, input_tensor_shape, input_tensor_dtype): + c1_params = relay_ir_builder.QnnConv2DParams(input_tensor_dtype) + c1_params.ifm.shape = input_tensor_shape + c1_params.kernel.shape = (7, 7, c1_params.ifm.shape[3], 8) + c1_params.strides = (2, 2) + c1_params.pad = "VALID" + c1_params.activation = "CLIP" + c1_params.clip_min = 90 + c1_params.clip_max = 110 + c1_params.update_output_qnn_params( + input_tensor_dtype, input_tensor_dtype, input_tensor_dtype + ) + input0 = relay.var(input_tensor_name, shape=c1_params.ifm.shape, dtype=c1_params.ifm.dtype) + c1, new_params = relay_ir_builder.create_qnn_conv2d(c1_params, input0) + c1_params.ofm.shape = get_shape_expr(input0, c1) + + c2_params = relay_ir_builder.QnnConv2DParams(input_tensor_dtype) + c2_params.ifm.shape = c1_params.ofm.shape + c2_params.kernel.shape = (5, 5, c2_params.ifm.shape[3], 16) + c2_params.strides = (1, 1) + c2_params.pad = "SAME" + c2_params.update_output_qnn_params() + c2, new_params = relay_ir_builder.create_qnn_conv2d(c2_params, c1) + c2_params.ofm.shape = 
get_shape_expr(input0, c2) + + f = relay.Function([input0], c2) + mod = tvm.IRModule() + mod["main"] = f + return mod, [c2_params, c1_params] + + test_cases = [ + (create_graph_single, ["input", (1, 300, 300, 3), "int8"]), + (create_graph_double, ["input", (1, 128, 256, 4), "int8"]), + (create_graph_activation, ["input", (1, 64, 100, 4), "int8"]), + ] + np.random.seed(42) + for test_case in test_cases: + relay_module, conv_params = test_case[0](*test_case[1]) + input_tensor, input_shape, input_dtype = test_case[1] + mod = ethosu.partition_for_ethosu(relay_module) + + # Generate reference data + in_min, in_max = util.get_range_for_dtype_str(input_dtype) + input_data = { + input_tensor: np.random.randint( + in_min, high=in_max, size=input_shape, dtype=input_dtype + ) + } + output_data = generate_ref_data(relay_module, input_data) + + compiled_models = infra.build_source( + mod, + input_data, + output_data, + accel_type, + ) + + # Assumes only two runtime.Modules are created -- i.e. single offload module + imported_modules = compiled_models[0].module.lib.imported_modules + assert len(imported_modules) == 2 + ethosu_module = imported_modules[0] + + # Verify generated C source + get_cs = tvm._ffi.get_global_func("runtime.module.ethosu.getcs") + cmms = get_cs(ethosu_module) + cmms = bytes.fromhex(cmms) + infra.print_payload(cmms) + infra.verify_source(compiled_models, accel_type) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py new file mode 100644 index 0000000000000..76b8512d558d3 --- /dev/null +++ b/tests/python/contrib/test_ethosu/test_networks.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +import pytest + +pytest.importorskip("ethosu.vela") +from tests.python.relay.aot.aot_test_utils import ( + convert_to_relay, + generate_ref_data, +) +import numpy as np + +import tvm +import tvm.micro as micro +from tvm import relay +from tvm.relay.backend.contrib import ethosu +from tvm.relay.backend.contrib.ethosu import util +import tvm.relay.testing.tf as tf_testing + +import infra + +ACCEL_TYPES = ["ethos-u55-256", "ethos-u55-128", "ethos-u55-64", "ethos-u55-32"] + + +def test_forward_mobilenet_v1(accel_type="ethos-u55-256"): + """Test the Mobilenet V1 TF Lite model.""" + np.random.seed(23) + tflite_model_file = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/" + "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", + "mobilenet_v1_1.0_224_quant.tflite", + ) + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + input_tensor = "input" + input_dtype = "uint8" + input_shape = (1, 224, 224, 3) + in_min, in_max = util.get_range_for_dtype_str(input_dtype) + input_data = np.random.randint(in_min, high=in_max, size=input_shape, dtype=input_dtype) + + relay_mod, params = convert_to_relay(tflite_model_buf, input_data, "input") + input_data = {input_tensor: input_data} + output_data = generate_ref_data(relay_mod, input_data) + + mod = ethosu.partition_for_ethosu(relay_mod, 
params) + compiled_models = infra.build_source(mod, input_data, output_data, accel_type) + infra.verify_source(compiled_models, accel_type) + + +if __name__ == "__main__": + test_forward_mobilenet_v1() diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 490257ac66da1..1b021fd738fe2 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -26,7 +26,7 @@ import shutil import subprocess import tarfile -from typing import NamedTuple, Union, Optional, List, Dict +from typing import Any, NamedTuple, Union, Optional, List, Dict import pytest import numpy as np @@ -56,17 +56,53 @@ class AOTTestModel(NamedTuple): Dict of input names to value arrays outputs: List[np.array] Ordered list of output value arrays + output_tolerance: Optional[Union[int, float]] + Allowed tolerance of the output name: str Name to use for this model params: Optional[Dict[str, np.array]] Dict of parameter names to value arrays + extra_memory_in_bytes: int + Extra memory to allocate after planned memory """ module: tvm.IRModule inputs: Dict[str, np.array] outputs: List[np.array] + output_tolerance: Optional[Union[int, float]] = None name: str = "default" params: Optional[Dict[str, np.array]] = None + extra_memory_in_bytes: int = 0 + + +class AOTCompiledTestModel(NamedTuple): + """A compiled AOTTestModel with associated module + + Parameters + ---------- + model: AOTTestModel + Input model to be compiled + module: tvm.runtime.Module + The compiled Module for the associated AOTTestModel + """ + + model: AOTTestModel + module: tvm.runtime.Module + + +class AOTDataLinkage(NamedTuple): + """Section name and alignment used to place emitted test data + + Parameters + ---------- + section: str + Named section to place data into + alignment: int + Section alignment + """ + + section: str + alignment: int class AOTTestRunner(NamedTuple): @@ -80,14 +116,17 @@ class AOTTestRunner(NamedTuple): Code to prepend to the main function 
includes: List[str] Additional includes required to run the AOT test runner - parameters: Map[str, str] + parameters: Dict[str, str] Additional parameters to pass to the make command + pass_config: Dict[str, Any] + Additional pass configuration when building the model """ makefile: str = "default" prologue: str = "" includes: List[str] = [] parameters: Dict[str, str] = {} + pass_config: Dict[str, Any] = {} AOT_DEFAULT_RUNNER = AOTTestRunner() @@ -225,11 +264,20 @@ def subprocess_log_output(cmd, cwd, logfile): return proc.wait() -def emit_main_prologue(main_file, custom_prologue, workspace_bytes): +# TODO: Move to linker script with list of symbols rather than coding into source +def emit_data_linkage(output_file, data_linkage): + if data_linkage is not None: + output_file.write( + f'__attribute__((section("{data_linkage.section}"), aligned({data_linkage.alignment}))) ' + ) + + +def emit_main_prologue(main_file, custom_prologue, workspace_bytes, data_linkage): # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment. main_file.write( f"#define WORKSPACE_SIZE ({workspace_bytes} + TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES)\n" ) + emit_data_linkage(main_file, data_linkage) main_file.write("static uint8_t g_aot_memory[WORKSPACE_SIZE];\n") main_file.write("tvm_workspace_t app_workspace;\n") main_file.write( @@ -242,9 +290,14 @@ def emit_main_prologue(main_file, custom_prologue, workspace_bytes): return StackMemoryManager_Free(&app_workspace,ptr); } -void TVMPlatformAbort(tvm_crt_error_t code) { } +void TVMPlatformAbort(tvm_crt_error_t code) { exit(-1); } -void TVMLogf(const char* msg, ...) { } +void TVMLogf(const char* msg, ...) 
{ + va_list args; + va_start(args, msg); + vfprintf(stdout, msg, args); + va_end(args); +} TVM_DLL int TVMFuncRegisterGlobal(const char* name, TVMFunctionHandle f, int override) {} int main(){\n @@ -360,23 +413,30 @@ def fake_tensor(source, source_index, packed_index): main_file.write("\n") -def emit_main_compare(main_file, output_list, mod_name): +def emit_main_compare(main_file, output_list, output_tolerance, mod_name): num_outputs = len(output_list) actual_data_name = mangle_name(mod_name, "output_data") expected_data_name = mangle_name(mod_name, "expected_output_data") for i in range(0, num_outputs): is_float_dtype = output_list[i].dtype == "float32" - main_file.write(f"for (int i = 0; i<{actual_data_name}{i}_len; i++){{\n") + + comparison_function = "abs" + tolerance = output_tolerance or 0 if is_float_dtype: - main_file.write( - f'if (fabs({actual_data_name}{i}[i]-{expected_data_name}{i}[i]) > 0.001f){{\n\tprintf("{AOT_FAILURE_TOKEN}\\n");\n\treturn -1;}}\n' - ) - else: - main_file.write( - f'if ({actual_data_name}{i}[i]!={expected_data_name}{i}[i]){{\n\tprintf("{AOT_FAILURE_TOKEN}\\n");\n\treturn -1;}}\n' - ) - main_file.write("}\n") + comparison_function = "fabs" + tolerance = output_tolerance or 0.001 + + main_file.write( + f""" + for (int i = 0; i<{actual_data_name}{i}_len; i++) {{ + if ({comparison_function}({actual_data_name}{i}[i]-{expected_data_name}{i}[i]) > {tolerance}) {{ + printf("{AOT_FAILURE_TOKEN}\\n"); + return -1; + }} + }} + """ + ) def emit_main_init_memory_manager(main_file): @@ -392,6 +452,8 @@ def emit_main_epilogue(main_file): def emit_main_common_includes(main_file, custom_includes): main_file.write("#include \n") + main_file.write("#include \n") + main_file.write("#include \n") main_file.write("#include \n") main_file.write('#include "tvm/runtime/c_runtime_api.h"\n') main_file.write('#include "tvm/runtime/crt/stack_allocator.h"\n') @@ -404,7 +466,14 @@ def emit_main_micro_include(main_file, mod_name): def create_main( - test_name, 
models, output_path, custom_includes, custom_prologue, interface_api, workspace_bytes + test_name, + models, + output_path, + custom_includes, + custom_prologue, + data_linkage, + interface_api, + workspace_bytes, ): file_path = pathlib.Path(f"{output_path}/" + test_name).resolve() # create header file @@ -418,7 +487,7 @@ def create_main( for model in models: emit_main_data(main_file, model.inputs, model.outputs, model.name) - emit_main_prologue(main_file, custom_prologue, workspace_bytes) + emit_main_prologue(main_file, custom_prologue, workspace_bytes, data_linkage) emit_main_init_memory_manager(main_file) if interface_api == "c": @@ -432,11 +501,11 @@ def create_main( emit_main_packed_call(main_file, model.inputs, model.outputs, model.name) for model in models: - emit_main_compare(main_file, model.outputs, model.name) + emit_main_compare(main_file, model.outputs, model.output_tolerance, model.name) emit_main_epilogue(main_file) -def create_header_file(tensor_name, npy_data, output_path): +def create_header_file(tensor_name, npy_data, output_path, data_linkage): """ This method generates a header file containing the data contained in the numpy array provided. It is used to capture the tensor data (for both inputs and expected outputs) to be bundled into the standalone application. 
@@ -450,6 +519,8 @@ def create_header_file(tensor_name, npy_data, output_path): header_file.write("#include \n") header_file.write(f"const size_t {tensor_name}_len = {npy_data.size};\n") + emit_data_linkage(header_file, data_linkage) + if npy_data.dtype == "int8": header_file.write(f"int8_t {tensor_name}[] =") elif npy_data.dtype == "int32": @@ -471,34 +542,54 @@ def extract_main_workspace_size_bytes(extract_dir): return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"] -def compile_and_run( +def compile_models( models: Union[List[AOTTestModel], AOTTestModel], runner: AOTTestRunner, interface_api, use_unpacked_api, - debug_calculated_workspaces=False, workspace_byte_alignment=8, enable_op_fusion=True, ): - """ - This method verifies the generated source - """ + if not isinstance(models, list): + models = [models] + base_target = "c -runtime=c --link-params --executor=aot" extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}" target = f"{base_target} {extra_target}" - cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} " - if not isinstance(models, list): - models = [models] + config = {"tir.disable_vectorize": True, **runner.pass_config} + if not enable_op_fusion: + config["relay.FuseOps.max_depth"] = 1 + with tvm.transform.PassContext(opt_level=3, config=config): + return [ + AOTCompiledTestModel( + model=model, + module=tvm.relay.build( + model.module, + target, + target_host=target, + params=model.params, + mod_name=model.name, + ), + ) + for model in models + ] + + +def execute_models( + compiled_models: List[AOTCompiledTestModel], + runner, + interface_api, + debug_calculated_workspaces=False, + workspace_byte_alignment=8, + data_linkage=None, +): + cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} " # The calculated workspaces will not account for stack allocator tags used for debugging if 
debug_calculated_workspaces: cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK " - config = {"tir.disable_vectorize": True} - if not enable_op_fusion: - config["relay.FuseOps.max_depth"] = 1 - tmp_path = utils.tempdir() tmp_dir = tmp_path.temp_dir @@ -515,21 +606,14 @@ def compile_and_run( ) workspace_bytes = 0 - for model in models: - with tvm.transform.PassContext(opt_level=3, config=config): - lib = tvm.relay.build( - model.module, - target, - target_host=target, - params=model.params, - mod_name=model.name, - ) - + for compiled_model in compiled_models: + model = compiled_model.model tar_file = os.path.join(base_path, f"{model.name}.tar") - export_model_library_format(lib, tar_file) + export_model_library_format(compiled_model.module, tar_file) t = tarfile.open(tar_file) t.extractall(base_path) + workspace_bytes += model.extra_memory_in_bytes workspace_bytes += extract_main_workspace_size_bytes(base_path) for key in model.inputs: @@ -538,6 +622,7 @@ def compile_and_run( f'{mangle_name(model.name, "input_data")}_{sanitized_tensor_name}', model.inputs[key], include_path, + data_linkage, ) for i in range(len(model.outputs)): @@ -545,19 +630,22 @@ def compile_and_run( (f'{mangle_name(model.name,"output_data")}{i}'), np.zeros(model.outputs[i].shape, model.outputs[i].dtype), include_path, + data_linkage, ) create_header_file( (f'{mangle_name(model.name, "expected_output_data")}{i}'), model.outputs[i], include_path, + data_linkage, ) create_main( "test.c", - models, + [compiled_model.model for compiled_model in compiled_models], build_path, runner.includes, runner.prologue, + data_linkage, interface_api, workspace_bytes, ) @@ -592,6 +680,32 @@ def compile_and_run( assert AOT_SUCCESS_TOKEN in run_log.read() +def compile_and_run( + models: Union[List[AOTTestModel], AOTTestModel], + runner: AOTTestRunner, + interface_api, + use_unpacked_api, + debug_calculated_workspaces=False, + workspace_byte_alignment=8, + enable_op_fusion=True, +): + """ + This method verifies 
the generated source + """ + + compiled_models = compile_models( + models, runner, interface_api, use_unpacked_api, workspace_byte_alignment, enable_op_fusion + ) + + execute_models( + compiled_models, + runner, + interface_api, + debug_calculated_workspaces, + workspace_byte_alignment, + ) + + def generate_ref_data(mod, input_data, params=None, target="llvm"): """Generate reference data through executing the relay module""" compile_engine.get().clear() diff --git a/tests/python/relay/aot/corstone300.ld b/tests/python/relay/aot/corstone300.ld index 4a6b22480d9f8..9534b869f6e6c 100644 --- a/tests/python/relay/aot/corstone300.ld +++ b/tests/python/relay/aot/corstone300.ld @@ -257,6 +257,14 @@ SECTIONS __bss_end__ = .; } > DTCM AT > DTCM + .ddr : + { + . = ALIGN(4); + . = ALIGN(16); + *(ethosu_scratch) + . = ALIGN (16); + } > DDR + .data_sram : { . = ALIGN(16); diff --git a/tests/python/relay/aot/corstone300.mk b/tests/python/relay/aot/corstone300.mk index 3a946f2cd8768..8d03ccc5b5f40 100644 --- a/tests/python/relay/aot/corstone300.mk +++ b/tests/python/relay/aot/corstone300.mk @@ -28,9 +28,11 @@ endif ARM_CPU=ARMCM55 DMLC_CORE=${TVM_ROOT}/3rdparty/dmlc-core ETHOSU_PATH=/opt/arm/ethosu +DRIVER_PATH=${ETHOSU_PATH}/core_driver CMSIS_PATH=${ETHOSU_PATH}/cmsis PLATFORM_PATH=${ETHOSU_PATH}/core_platform/targets/corstone-300 PKG_COMPILE_OPTS = -g -Wall -O2 -Wno-incompatible-pointer-types -Wno-format -mcpu=cortex-m55 -mthumb -mfloat-abi=hard -std=gnu99 +CMAKE = /opt/arm/cmake/bin/cmake CC = arm-none-eabi-gcc AR = arm-none-eabi-ar RANLIB = arm-none-eabi-ranlib @@ -40,11 +42,15 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I$(build_dir)/../include \ -I$(CODEGEN_ROOT)/host/include \ -I${PLATFORM_PATH} \ + -I${DRIVER_PATH}/include \ -I${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Include/ \ -I${CMSIS_PATH}/CMSIS/Core/Include \ -I${CMSIS_PATH}/CMSIS/NN/Include \ -I${CMSIS_PATH}/CMSIS/DSP/Include \ - -isystem$(STANDALONE_CRT_DIR)/include \ + -isystem$(STANDALONE_CRT_DIR)/include 
+DRIVER_CMAKE_FLAGS = -DCMAKE_TOOLCHAIN_FILE=$(ETHOSU_TEST_ROOT)/arm-none-eabi-gcc.cmake \ + -DETHOSU_LOG_SEVERITY=debug \ + -DCMAKE_SYSTEM_PROCESSOR=cortex-m55 PKG_LDFLAGS = -lm -specs=nosys.specs -static -T ${AOT_TEST_ROOT}/corstone300.ld @@ -61,6 +67,11 @@ CMSIS_STARTUP_SRCS = $(shell find ${CMSIS_PATH}/Device/ARM/${ARM_CPU}/Source/*.c CMSIS_NN_SRCS = $(shell find ${CMSIS_PATH}/CMSIS/NN/Source/*/*.c) UART_SRCS = $(shell find ${PLATFORM_PATH}/*.c) +ifdef ETHOSU_TEST_ROOT +ETHOSU_ARCHIVE=${build_dir}/ethosu_core_driver/libethosu_core_driver.a +ETHOSU_INCLUDE=-I$(ETHOSU_TEST_ROOT) +endif + aot_test_runner: $(build_dir)/aot_test_runner $(build_dir)/stack_allocator.o: $(TVM_ROOT)/src/runtime/crt/memory/stack_allocator.c @@ -94,9 +105,14 @@ ${build_dir}/libuart.a: $(UART_SRCS) $(QUIET)$(AR) -cr $(abspath $(build_dir)/libuart.a) $(abspath $(build_dir))/libuart/*.o $(QUIET)$(RANLIB) $(abspath $(build_dir)/libuart.a) -$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/crt_backend_api.o $(build_dir)/stack_allocator.o ${build_dir}/libcmsis_startup.a ${build_dir}/libcmsis_nn.a ${build_dir}/libuart.a $(build_dir)/libcodegen.a +${build_dir}/ethosu_core_driver/libethosu_core_driver.a: + $(QUIET)mkdir -p $(@D) + $(QUIET)cd $(DRIVER_PATH) && $(CMAKE) -B $(abspath $(build_dir)/ethosu_core_driver) $(DRIVER_CMAKE_FLAGS) + $(QUIET)cd $(abspath $(build_dir)/ethosu_core_driver) && $(MAKE) + +$(build_dir)/aot_test_runner: $(build_dir)/test.c $(build_dir)/crt_backend_api.o $(build_dir)/stack_allocator.o ${build_dir}/libcmsis_startup.a ${build_dir}/libcmsis_nn.a ${build_dir}/libuart.a $(build_dir)/libcodegen.a $(ETHOSU_ARCHIVE) $(QUIET)mkdir -p $(@D) - $(QUIET)$(CC) $(PKG_CFLAGS) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(PKG_LDFLAGS) + $(QUIET)$(CC) $(PKG_CFLAGS) $(ETHOSU_INCLUDE) -o $@ -Wl,--whole-archive $^ -Wl,--no-whole-archive $(PKG_LDFLAGS) clean: $(QUIET)rm -rf $(build_dir)/crt @@ -109,6 +125,7 @@ run: $(build_dir)/aot_test_runner -C cpu0.CFGITCMSZ=15 -C 
mps3_board.uart0.out_file=\"-\" -C mps3_board.uart0.shutdown_tag=\"EXITTHESIM\" \ -C mps3_board.visualisation.disable-visualisation=1 -C mps3_board.telnetterminal0.start_telnet=0 \ -C mps3_board.telnetterminal1.start_telnet=0 -C mps3_board.telnetterminal2.start_telnet=0 -C mps3_board.telnetterminal5.start_telnet=0 \ + -C ethosu.extra_args="--fast" \ -C ethosu.num_macs=$(NPU_VARIANT) $(build_dir)/aot_test_runner .SUFFIXES: diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 68a9b0b436e76..4d15fa8703b17 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -297,13 +297,22 @@ def test_mobilenet(debug_calculated_workspaces, workspace_byte_alignment): interface_api = "c" test_runner = AOT_DEFAULT_RUNNER + # TODO(@Mousius) - Enable memory planning to take into account debug information + debugging_memory_overhead = 1024 * 1024 + mod, params = testing.mobilenet.get_workload(batch_size=1) data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape] data = np.random.uniform(size=data_shape).astype("float32") inputs = {"data": data} output_list = generate_ref_data(mod, inputs, params) compile_and_run( - AOTTestModel(module=mod, inputs=inputs, outputs=output_list, params=params), + AOTTestModel( + module=mod, + inputs=inputs, + outputs=output_list, + params=params, + extra_memory_in_bytes=debugging_memory_overhead, + ), test_runner, interface_api, use_unpacked_api,