From 836648eeead9f88efc41b714862aec97f84e4805 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Thu, 6 May 2021 11:06:48 +0100 Subject: [PATCH] Decoupling AOT from graph memory planner In this PR we are decoupling AOT from the Graph Memory Planner. Since AOT has the runner expressed in TIR we can get rid of the GMP in relay and use the Storage Rewrite Pass to do memory planning on the runner function. This also sorts out the issue mentioned in #8062 Change-Id: I6e33fadbf0462edf0366ee37e84ffde26123d3cb --- src/relay/backend/aot_executor_codegen.cc | 252 +++++++++++++++++----- tests/python/relay/aot/aot_test_utils.py | 40 ++++ tests/python/relay/aot/test_crt_aot.py | 22 ++ 3 files changed, 258 insertions(+), 56 deletions(-) diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 573206c782f6b..b4578c93f501d 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -44,52 +45,179 @@ namespace tvm { namespace relay { namespace backend { +/** + * Struct to contain information about intermediate variables in the + * runner function + */ +struct StorageInfo { + /*! \brief unique integer identifier of the particular intermediate variable */ + std::vector ids; + /*! \brief exact size of the temporary */ + std::vector sizes_bytes; + /*! \brief device type of the temporary variable */ + std::vector dev_types; +}; + using IntegerArray = Array; using TargetsMap = std::unordered_map; +using StorageMap = + std::unordered_map; -class AotReturnSidVisitor : public ExprVisitor { +/** + * This is an on demand allocator for AOT. A new temporary + * (storage allocator identifier) is allocated for each operation. + */ +class AOTOnDemandAllocator : public ExprVisitor { public: - explicit AotReturnSidVisitor(Map> storage_device_map) - : storage_device_map_{storage_device_map}, return_sid_{-1} {} + // run the visitor on a function. + void Run(const Function& func) { + node_device_map_ = CollectDeviceInfo(func); - IntegerArray FindReturnSid(Function func) { - VisitExpr(func->body); - return return_sid_; + for (Expr param : func->params) { + CreateSid(param.operator->()); + } + + GetSid(func->body); } - protected: - void AssignReturnSid(Expr e) { - auto iter = storage_device_map_.find(e); - if (iter != storage_device_map_.end()) { - return_sid_ = (*iter).second[0]; + std::vector GetReturnIds() const { return return_ids_; } + + StorageMap GetStorageMap() const { return storage_device_map_; } + + void VisitExpr_(const ConstantNode* op) final { + CreateSid(op); + AssignReturnSid(GetRef(op)); + } + + void VisitExpr_(const CallNode* op) final { + // create token for the call node. + CreateSid(op); + for (Expr arg : op->args) { + GetSid(arg); } + AssignReturnSid(GetRef(op)); } - void VisitExpr_(const ConstantNode* cn) override { - ExprVisitor::VisitExpr_(cn); - AssignReturnSid(GetRef(cn)); + void VisitExpr_(const VarNode* op) final { + ExprVisitor::VisitExpr_(op); + AssignReturnSid(GetRef(op)); } - void VisitExpr_(const VarNode* vn) override { - ExprVisitor::VisitExpr_(vn); - AssignReturnSid(GetRef(vn)); + void VisitExpr_(const FunctionNode* op) final { + // do not recurse into sub function. } - void VisitExpr_(const CallNode* cn) override { - ExprVisitor::VisitExpr_(cn); - AssignReturnSid(GetRef(cn)); + void VisitExpr_(const GlobalVarNode* op) final { + // Do nothing. 
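Editorial note: the core idea of the new AOTOnDemandAllocator above is small: every intermediate value met while visiting the Relay graph gets a fresh integer storage identifier (sid) together with its exact size in bytes and its device type, so the later TIR passes can plan memory without Relay's GraphPlanMemory. The following stand-alone sketch (plain C++, simplified types; names such as ToyAllocator and ToyStorageInfo are illustrative and do not exist in TVM) shows only that bookkeeping, in isolation.

// Minimal sketch of on-demand storage-id assignment (illustrative only,
// not the TVM classes; assumes every value has a static byte size).
#include <cstdio>
#include <vector>

struct ToyStorageInfo {
  std::vector<int> ids;        // one id per produced tensor
  std::vector<size_t> sizes;   // exact size of each tensor in bytes
  std::vector<int> dev_types;  // device type of each tensor
};

class ToyAllocator {
 public:
  // Called once per expression that produces a value.
  ToyStorageInfo Create(size_t size_bytes, int device_type) {
    ToyStorageInfo info;
    info.ids.push_back(next_id_++);
    info.sizes.push_back(size_bytes);
    info.dev_types.push_back(device_type);
    return info;
  }

 private:
  int next_id_{0};  // mirrors the running sid_ counter in the patch
};

int main() {
  ToyAllocator alloc;
  ToyStorageInfo a = alloc.Create(/*size_bytes=*/1024, /*device_type=*/1);
  ToyStorageInfo b = alloc.Create(/*size_bytes=*/4096, /*device_type=*/1);
  std::printf("a -> sid %d, b -> sid %d\n", a.ids[0], b.ids[0]);  // prints 0 and 1
  return 0;
}

The real allocator additionally flattens tuples, which is why a single expression may carry several ids, sizes and device types at once.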
} - void VisitExpr_(const LetNode* op) override { VisitExpr(op->body); } + void VisitExpr_(const OpNode* op) final { + // Do nothing. + } + + void VisitExpr_(const TupleNode* op) final { + StorageInfo field_sid; + Expr expr = GetRef(op); + for (Expr field : op->fields) { + auto sid = GetSid(field); + field_sid.ids.insert(field_sid.ids.end(), sid.ids.begin(), sid.ids.end()); + field_sid.dev_types.insert(field_sid.dev_types.end(), sid.dev_types.begin(), + sid.dev_types.end()); + field_sid.sizes_bytes.insert(field_sid.sizes_bytes.end(), sid.sizes_bytes.begin(), + sid.sizes_bytes.end()); + } + + storage_device_map_[expr] = field_sid; + AssignReturnSid(expr); + } - void VisitExpr_(const TupleNode* tn) override { - ExprVisitor::VisitExpr_(tn); - AssignReturnSid(GetRef(tn)); + void VisitExpr_(const TupleGetItemNode* op) final { + Expr expr = GetRef(op); + const auto& sid = GetSid(op->tuple); + ICHECK_LT(static_cast(op->index), sid.ids.size()); + storage_device_map_[expr].ids = {sid.ids[op->index]}; + storage_device_map_[expr].sizes_bytes = {sid.sizes_bytes[op->index]}; + storage_device_map_[expr].dev_types = {sid.dev_types[op->index]}; + AssignReturnSid(expr); } + void VisitExpr_(const IfNode* op) final { LOG(FATAL) << "if is not supported."; } + + void VisitExpr_(const LetNode* op) final { LOG(FATAL) << "if is not supported."; } + private: - Map> storage_device_map_; - IntegerArray return_sid_; + void AssignReturnSid(Expr e) { + auto iter = storage_device_map_.find(e); + if (iter != storage_device_map_.end()) { + return_ids_ = (*iter).second.ids; + } + } + /*! + * \brief ceil(size/word_size) to get number of words. + * \param size The original size. + * \param word_size The element size. + */ + static size_t DivRoundUp(size_t size, size_t word_size) { + return (size + word_size - 1) / word_size; + } + /*! + * \brief Get the memory requirement. + * \param prototype The prototype token. + * \return The required memory size. + */ + size_t GetMemorySize(const TensorTypeNode* ttype) { + ICHECK(ttype != nullptr); + size_t size = 1; + for (IndexExpr dim : ttype->shape) { + const int64_t* pval = tir::as_const_int(dim); + ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; + ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; + size *= static_cast(pval[0]); + } + size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); + return size; + } + /*! + * \brief Get the necessary token. + * \param expr The expression. + * \return The corresponding token. + */ + StorageInfo GetSid(const Expr& expr) { + this->VisitExpr(expr); + auto it = storage_device_map_.find(expr); + ICHECK(it != storage_device_map_.end()); + return it->second; + } + + void CreateSid(const ExprNode* op) { + StorageInfo sid; + Expr expr = GetRef(op); + int device_type = node_device_map_.count(GetRef(op)) ? node_device_map_[expr]->value : 0; + if (const auto* tuple_type = op->checked_type().as()) { + for (Type t : tuple_type->fields) { + const auto* ttype = t.as(); + ICHECK(ttype); + sid.ids.push_back(sid_++); + sid.dev_types.push_back(device_type); + sid.sizes_bytes.push_back(GetMemorySize(ttype)); + } + } else { + const auto* ttype = op->checked_type().as(); + ICHECK(ttype); + sid.ids.push_back(sid_++); + sid.dev_types.push_back(device_type); + sid.sizes_bytes.push_back(GetMemorySize(ttype)); + } + storage_device_map_[expr] = sid; + } + /*! \brief mapping of expression -> storageInfo*/ + StorageMap storage_device_map_; + /*! 
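Editorial note: GetMemorySize above is the piece that turns a static tensor type into a byte count: multiply all constant dimensions, then multiply by the element size rounded up to whole bytes. A minimal stand-alone version of that arithmetic (plain C++, no TVM types; the shape is just a vector of known extents, and TensorSizeBytes is an illustrative name) looks like this:

// Stand-alone sketch of the byte-size computation used for each sid
// (assumes fully static shapes, as the allocator above also requires).
#include <cstdint>
#include <cstdio>
#include <vector>

static size_t DivRoundUp(size_t size, size_t word_size) {
  return (size + word_size - 1) / word_size;
}

size_t TensorSizeBytes(const std::vector<int64_t>& shape, int dtype_bits, int lanes) {
  size_t size = 1;
  for (int64_t dim : shape) size *= static_cast<size_t>(dim);
  // Round bits * lanes up to whole bytes, e.g. int8 -> 1 byte, float32 -> 4 bytes.
  size *= DivRoundUp(static_cast<size_t>(dtype_bits) * lanes, 8);
  return size;
}

int main() {
  // A 1x224x224x3 uint8 tensor, as used by the quantized MobileNet test: 150528 bytes.
  std::printf("%zu\n", TensorSizeBytes({1, 224, 224, 3}, /*dtype_bits=*/8, /*lanes=*/1));
  return 0;
}

Symbolic dimensions cannot be sized this way, which is why the real function fails with an ICHECK when a shape is not constant.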
\brief mapping of expression -> device type*/ + Map node_device_map_; + /*! \brief current id of the temporary allocated*/ + int sid_{0}; + /*! \brief the set of identifiers that are return variables */ + std::vector return_ids_; }; /*! \brief Code generator for AOT executor */ @@ -120,14 +248,14 @@ class AOTExecutorCodegen : public ExprVisitor { * \brief Return a vector of variables that represents the sids for the given Relay Expr */ std::vector PackSid(Expr expr) { - Array sids = storage_device_map_[expr]; + auto sids = storage_device_map_[expr]; std::vector sid_vars; // Note that an expression can have multiple sids associated with it // e.g., returning multiple values from a function - for (const auto& sid : sids[0]) { + for (const auto& sid : sids.ids) { // Determine if an sid is an output buffer - int sid_int = static_cast((sid.as())->value); + int sid_int = sid; auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid_int); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); @@ -390,8 +518,8 @@ class AOTExecutorCodegen : public ExprVisitor { } ICHECK_GE(storage_device_map_.count(expr), 0); - auto& device_type = storage_device_map_[expr][1]; - auto call_dev_type = device_type[0]->value; + auto& device_type = storage_device_map_[expr].dev_types; + auto call_dev_type = device_type[0]; // Normal Relay Function if (targets_.size() == 1) { // homogeneous execution. @@ -428,14 +556,14 @@ class AOTExecutorCodegen : public ExprVisitor { // If the Var node is an output node we need to copy the content of the variable to the output // It's safe to check the SID here because Var StorageToken are never reallocated - Array sids = storage_device_map_[expr]; + auto sids = storage_device_map_[expr]; - auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), - static_cast((sids[0][0].as())->value)); + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sids.ids[0]); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); auto var_expr = FindExpr(expr); - CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], sids[2][0]); + CopyToOutput(main_signature_[input_vars_.size() + output_index], var_expr[0], + sids.sizes_bytes[0]); } } @@ -444,18 +572,18 @@ class AOTExecutorCodegen : public ExprVisitor { size_t index = params_.size(); std::string name = "p" + std::to_string(index); - param_storage_ids_[name] = storage_device_map_[expr][0][0]->value; + param_storage_ids_[name] = storage_device_map_[expr].ids[0]; params_[name] = op->data; params_by_expr_.Set(expr, name); // If the Constant node is an output node we need to copy the content of the parameter to the // output A Var node can only produce a single output - Array sids = storage_device_map_[expr]; - auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), - static_cast((sids[0][0].as())->value)); + auto sids = storage_device_map_[expr]; + auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sids.ids[0]); if (output_iter != return_sid_.end()) { int output_index = std::distance(return_sid_.begin(), output_iter); - CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr), sids[2][0]); + CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr), + sids.sizes_bytes[0]); } } @@ -511,9 +639,9 @@ class AOTExecutorCodegen : public ExprVisitor { continue; } - for (unsigned int i = 0; i < kv.second[0].size(); i++) 
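Editorial note: a recurring pattern in the codegen changes above is deciding whether a storage id belongs to one of the function outputs: the id is looked up in return_sid_ and, if found, its position selects the matching output buffer in the runner signature; otherwise the value stays in an intermediate sid buffer. The small stand-alone sketch below shows just that lookup (plain C++; OutputIndexOf is an illustrative name, not a TVM function).

// Sketch of the "is this sid an output?" check used in PackSid and in the
// Var/Constant visitors (illustrative only; -1 means "intermediate value").
#include <algorithm>
#include <cstdio>
#include <vector>

int OutputIndexOf(int sid, const std::vector<int>& return_sids) {
  auto it = std::find(return_sids.begin(), return_sids.end(), sid);
  if (it == return_sids.end()) return -1;  // ordinary intermediate buffer
  // Position among the outputs selects the output_<n> parameter of the runner.
  return static_cast<int>(std::distance(return_sids.begin(), it));
}

int main() {
  std::vector<int> return_sids = {7, 9};  // the runner returns two values
  std::printf("sid 9 -> output %d\n", OutputIndexOf(9, return_sids));  // output 1
  std::printf("sid 3 -> output %d\n", OutputIndexOf(3, return_sids));  // -1, not an output
  return 0;
}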
{ - int size = kv.second[2][i]; - int sid = static_cast((kv.second[0][i].as())->value); + for (unsigned int i = 0; i < kv.second.ids.size(); i++) { + int size = kv.second.sizes_bytes[i]; + int sid = kv.second.ids[i]; if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) { continue; @@ -523,6 +651,8 @@ class AOTExecutorCodegen : public ExprVisitor { // so we don't pay the price of allocation for every inference if (!allocated[sid]) { body = tir::Allocate(sids_table_[sid], DataType::Int(8), {size}, tir::const_true(), body); + body = tir::AttrStmt(sids_table_[sid], tir::attr::storage_scope, tir::StringImm("global"), + body); } allocated[sid] = true; } @@ -566,7 +696,8 @@ class AOTExecutorCodegen : public ExprVisitor { std::unordered_map param_storage_ids_; /*! \brief plan memory of device result */ - Map> storage_device_map_; + StorageMap storage_device_map_; + /*! \brief mapping sid -> tir::Var */ std::unordered_map sids_table_; /*! \brief lowered funcs */ std::unordered_map lowered_funcs_; @@ -577,7 +708,7 @@ class AOTExecutorCodegen : public ExprVisitor { /*! \brief the set of statements that make the program */ std::vector stmts_; /*! \brief the list of return sids (note that the function might return more then one output */ - IntegerArray return_sid_; + std::vector return_sid_; public: AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host) @@ -588,9 +719,11 @@ class AOTExecutorCodegen : public ExprVisitor { } LoweredOutput Codegen(relay::Function func) { - // Get the module, storage map and token sizes - auto pf = GetPackedFunc("relay.backend.GraphPlanMemory"); - storage_device_map_ = (*pf)(func); + auto aot_allocator = AOTOnDemandAllocator(); + aot_allocator.Run(func); + + // Retrieve the storage map + storage_device_map_ = aot_allocator.GetStorageMap(); int input_index = 0; for (auto input : func->params) { @@ -600,14 +733,14 @@ class AOTExecutorCodegen : public ExprVisitor { // Define the storage allocator ids for (auto kv : storage_device_map_) { - for (const auto& sid : kv.second[0]) { + for (const auto& sid : kv.second.ids) { te::Var sid_var(MakeString("sid_", sid), PointerType(PrimType(DataType::Int(8)))); sids_table_[sid] = sid_var; } } - // Find the return sid - return_sid_ = AotReturnSidVisitor(storage_device_map_).FindReturnSid(func); + // Retrieve the return sids + return_sid_ = aot_allocator.GetReturnIds(); for (unsigned int output_index = 0; output_index < return_sid_.size(); output_index++) { main_signature_.push_back(tir::Var(MakeString("output_", output_index), DataType::Handle())); } @@ -635,14 +768,21 @@ class AOTExecutorCodegen : public ExprVisitor { } ret.external_mods = compile_engine_->LowerExternalFunctions(); + // Build the TIR IRModule + Map symbol_map; + symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func); + IRModule mod_run(symbol_map); + + // Apply storage rewrite pass to the runner function to do memory planning + auto storage_rewrite = tir::transform::StorageRewrite(); + mod_run = storage_rewrite(mod_run); + + // Update the lowered functions auto target_host_str = target_host_->str(); if (ret.lowered_funcs.find(target_host_str) != ret.lowered_funcs.end()) { - ret.lowered_funcs[target_host_str]->Add( - GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func); + ret.lowered_funcs[target_host_str]->Update(mod_run); } else { - Map symbol_map; - symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func); - 
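Editorial note: the allocation loop above walks every expression's storage info, but several expressions can share a storage id, so each sid must be materialised exactly once as a tir::Allocate wrapped around the runner body, and sids that are returned through caller-provided output buffers are skipped entirely. A stand-alone sketch of that de-duplication (plain C++; EmitAllocations and SidUse are illustrative names, and printing stands in for building the nested TIR allocations) is:

// Sketch of emitting one allocation per storage id, skipping output sids.
#include <algorithm>
#include <cstdio>
#include <unordered_map>
#include <vector>

struct SidUse { int sid; int size_bytes; };

void EmitAllocations(const std::vector<SidUse>& uses, const std::vector<int>& return_sids) {
  std::unordered_map<int, bool> allocated;
  for (const SidUse& u : uses) {
    if (std::find(return_sids.begin(), return_sids.end(), u.sid) != return_sids.end()) {
      continue;  // returned values live in caller-provided output buffers
    }
    if (!allocated[u.sid]) {
      std::printf("allocate sid_%d: %d bytes\n", u.sid, u.size_bytes);
    }
    allocated[u.sid] = true;  // later uses of the same sid reuse the same buffer
  }
}

int main() {
  // sid 0 is used twice, sid 2 is a return value: only sid_0 and sid_1 get allocated.
  EmitAllocations({{0, 1024}, {1, 4096}, {0, 1024}, {2, 512}}, /*return_sids=*/{2});
  return 0;
}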
ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map)); + ret.lowered_funcs.Set(target_host_str, mod_run); } ret.function_metadata = std::move(function_metadata_); ret.metadata = diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 8c7aefe70d091..e7358828c8f81 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -36,6 +36,46 @@ from tvm.micro import export_model_library_format +def convert_to_relay( + tflite_model_buf, + input_data, + input_node, +): + """ Convert a tflite model buffer in a Relay module """ + + def convert_to_list(x): + if not isinstance(x, list): + x = [x] + return x + + # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1 + try: + import tflite.Model + + tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) + except AttributeError: + import tflite + + tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0) + except ImportError: + raise ImportError("The tflite package must be installed") + + input_data = convert_to_list(input_data) + input_node = convert_to_list(input_node) + + shape_dict = {} + dtype_dict = {} + for i, e in enumerate(input_node): + shape_dict[e] = input_data[i].shape + dtype_dict[e] = input_data[i].dtype.name + + mod, params = relay.frontend.from_tflite( + tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict + ) + mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params) + return mod, params + + def subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout): """ This method runs a process and logs the output to both a log file and stdout diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py index 02b4de3a64f34..356813be6e55b 100644 --- a/tests/python/relay/aot/test_crt_aot.py +++ b/tests/python/relay/aot/test_crt_aot.py @@ -364,5 +364,27 @@ def test_byoc_utvm(use_calculated_workspaces): compile_and_run(mod, input_list, output_list, use_calculated_workspaces) +def test_quant_mobilenet_tfl(): + pytest.importorskip("tflite") + + import tvm.relay.testing.tf as tf_testing + + tflite_model_file = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/" + "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", + "mobilenet_v1_1.0_224_quant.tflite", + ) + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + data_shape = (1, 224, 224, 3) + in_min, in_max = (0, 255) + data = np.random.randint(in_min, high=in_max, size=data_shape, dtype="uint8") + mod, params = convert_to_relay(tflite_model_buf, data, "input") + inputs = {"input": data} + output_list = generate_ref_data(mod, inputs, params) + input_list = [inputs["input"]] + compile_and_run(mod, input_list, output_list, True, params) + + if __name__ == "__main__": pytest.main([__file__])
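Editorial note: the StorageRewrite pass applied to the runner module above is what actually shrinks memory: because the intermediate buffers are now plain TIR allocations, the pass can merge allocations whose live ranges do not overlap. The stand-alone sketch below is only a conceptual illustration of that kind of reuse (a first-fit free list over alloc/free events; it is not the TVM pass, and the Planner name is made up).

// Conceptual sketch of live-range based buffer reuse, the effect memory
// planning has on the runner function (greatly simplified, illustrative names).
#include <cstdio>
#include <map>

class Planner {
 public:
  // Allocate returns an offset; freed regions of at least `size` bytes are reused.
  size_t Allocate(size_t size) {
    for (auto it = free_.begin(); it != free_.end(); ++it) {
      if (it->second >= size) {  // first fit on the free list
        size_t offset = it->first;
        free_.erase(it);
        return offset;
      }
    }
    size_t offset = high_water_;  // otherwise grow the pool
    high_water_ += size;
    return offset;
  }
  void Free(size_t offset, size_t size) { free_[offset] = size; }
  size_t PoolSize() const { return high_water_; }

 private:
  std::map<size_t, size_t> free_;  // offset -> size of freed region
  size_t high_water_{0};
};

int main() {
  Planner p;
  size_t a = p.Allocate(1024);  // buffer A
  size_t b = p.Allocate(1024);  // buffer B, live at the same time as A
  p.Free(a, 1024);              // A is dead once its consumer has run
  size_t c = p.Allocate(512);   // C reuses A's slot instead of growing the pool
  std::printf("a=%zu b=%zu c=%zu pool=%zu\n", a, b, c, p.PoolSize());  // pool stays 2048
  return 0;
}

In the actual change this planning happens entirely inside tir::transform::StorageRewrite on the TIR runner function, which is why the Relay-side GraphPlanMemory call can be dropped from the AOT codegen.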