Decoupling AOT from graph memory planner

In this PR we decouple AOT from the Graph Memory Planner (GMP). Since AOT expresses the runner function in TIR, we can drop the GMP in Relay and instead use the Storage Rewrite pass to do memory planning on the runner function. This also resolves the issue mentioned in apache#8062. Change-Id: I6e33fadbf0462edf0366ee37e84ffde26123d3cb
giuseros · May 20, 2021 · d7ecb6d · d7ecb6d
1 parent dbd076a
commit d7ecb6d
Show file tree

Hide file tree

Showing 3 changed files with 250 additions and 25 deletions.
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
@@ -31,6 +31,7 @@
 #include <tvm/tir/expr.h>
 #include <tvm/tir/function.h>
 #include <tvm/tir/stmt.h>
+#include <tvm/tir/transform.h>
 
 #include <algorithm>
 #include <list>
@@ -46,13 +47,171 @@ namespace backend {
 
 using IntegerArray = Array<Integer>;
 using TargetsMap = std::unordered_map<int, Target>;
+/*!
+ * \brief Storage information attached to each Relay expression.
+ *
+ * Each mapped value is a list of three parallel rows with one entry per
+ * buffer owned by the expression:
+ *   - row 0: storage ids (sids)
+ *   - row 1: device types
+ *   - row 2: buffer sizes in bytes
+ * Keys are hashed and compared by object pointer.
+ */
+using StorageMap = std::unordered_map<Expr, std::vector<std::vector<int>>, runtime::ObjectPtrHash,
+                                      runtime::ObjectPtrEqual>;
+/*!
+ * \brief On-demand storage allocator for AOT.
+ *
+ * Visits a Relay function and assigns a fresh storage id (sid) to every
+ * function parameter, constant and call result; no sid is ever handed out
+ * twice by this class. Tuples and tuple projections do not allocate: they
+ * reuse the sids of the expressions they are built from.
+ *
+ * Every entry of the resulting StorageMap holds three parallel rows, indexed
+ * by kStorageIds, kDeviceTypes and kSizes.
+ */
+class AOTOnDemandAllocator : public ExprVisitor {
+ public:
+  /*!
+   * \brief Run the allocator on a function.
+   * \param func The Relay function to process.
+   * \return The storage information of every visited expression.
+   */
+  StorageMap Run(const Function& func) {
+    node_device_map_ = CollectDeviceInfo(func);
+
+    // Parameters are never visited as producers (VarNode is a no-op below),
+    // so their storage has to be created up front.
+    for (Expr param : func->params) {
+      CreateSid(param.operator->());
+    }
+
+    GetSid(func->body);
+    return storage_device_map_;
+  }
+
+  void VisitExpr_(const ConstantNode* op) final { CreateSid(op); }
+
+  void VisitExpr_(const CallNode* op) final {
+    // Allocate the call result first, then make sure every argument has storage.
+    CreateSid(op);
+    for (Expr arg : op->args) {
+      GetSid(arg);
+    }
+  }
+
+  void VisitExpr_(const VarNode* op) final {
+    // Do nothing: storage for parameters is created in Run().
+  }
+
+  void VisitExpr_(const FunctionNode* op) final {
+    // Do not recurse into sub functions.
+  }
+
+  void VisitExpr_(const GlobalVarNode* op) final {
+    // Do nothing.
+  }
+
+  void VisitExpr_(const OpNode* op) final {
+    // Do nothing.
+  }
+
+  /*! \brief A tuple owns no storage of its own: it concatenates the rows of its fields. */
+  void VisitExpr_(const TupleNode* op) final {
+    std::vector<int> field_ids;
+    std::vector<int> field_types;
+    std::vector<int> field_sizes;
+    Expr expr = GetRef<Expr>(op);
+    for (Expr field : op->fields) {
+      const auto& sids = GetSid(field);
+      Append(&field_ids, sids[kStorageIds]);
+      Append(&field_types, sids[kDeviceTypes]);
+      Append(&field_sizes, sids[kSizes]);
+    }
+    // Keep the same row order used by CreateSid: ids, device types, sizes.
+    storage_device_map_[expr] = {field_ids, field_types, field_sizes};
+  }
+
+  /*! \brief A tuple projection aliases the storage of the selected tuple entry. */
+  void VisitExpr_(const TupleGetItemNode* op) final {
+    Expr expr = GetRef<Expr>(op);
+    const auto& sids = GetSid(op->tuple);
+    // The index selects an entry inside each row, so it must be checked against
+    // the number of entries and not against the (fixed) number of rows.
+    const size_t index = static_cast<size_t>(op->index);
+    ICHECK_LT(index, sids[kStorageIds].size());
+    storage_device_map_[expr] = {std::vector<int>{sids[kStorageIds][index]},
+                                 std::vector<int>{sids[kDeviceTypes][index]},
+                                 std::vector<int>{sids[kSizes][index]}};
+  }
+
+  void VisitExpr_(const IfNode* op) final { LOG(FATAL) << "if is not supported."; }
+
+  void VisitExpr_(const LetNode* op) final { LOG(FATAL) << "let is not supported."; }
+
+ private:
+  /*! \brief Row indices inside a StorageMap entry. */
+  enum : size_t { kStorageIds = 0, kDeviceTypes = 1, kSizes = 2 };
+
+  /*!
+   * \brief Append all elements of src to dst.
+   * \param dst The vector to extend.
+   * \param src The elements to append.
+   */
+  static void Append(std::vector<int>* dst, const std::vector<int>& src) {
+    dst->insert(dst->end(), src.begin(), src.end());
+  }
+  /*!
+   * \brief ceil(size/word_size) to get number of words.
+   * \param size The original size.
+   * \param word_size The element size.
+   */
+  static size_t DivRoundUp(size_t size, size_t word_size) {
+    return (size + word_size - 1) / word_size;
+  }
+  /*!
+   * \brief Get the memory requirement of a tensor.
+   * \param ttype The tensor type; its shape must be fully static.
+   * \return The required memory size in bytes.
+   */
+  size_t GetMemorySize(const TensorTypeNode* ttype) {
+    ICHECK(ttype != nullptr);
+    size_t size = 1;
+    for (IndexExpr dim : ttype->shape) {
+      const int64_t* pval = tir::as_const_int(dim);
+      ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
+      ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
+      size *= static_cast<size_t>(pval[0]);
+    }
+    // Element size in bytes, rounded up for sub-byte types.
+    size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
+    return size;
+  }
+  /*!
+   * \brief Visit an expression and return its storage information.
+   * \param expr The expression.
+   * \return The rows (storage ids, device types, sizes) recorded for the expression.
+   */
+  const std::vector<std::vector<int>>& GetSid(const Expr& expr) {
+    this->VisitExpr(expr);
+    auto it = storage_device_map_.find(expr);
+    ICHECK(it != storage_device_map_.end());
+    return it->second;
+  }
+
+  /*!
+   * \brief Allocate fresh storage for an expression.
+   *
+   * One sid is created per tensor: a tuple-typed expression gets one sid for
+   * each of its fields, any other expression must be tensor-typed and gets one.
+   * \param op The expression node to allocate storage for.
+   */
+  void CreateSid(const ExprNode* op) {
+    std::vector<int> sids;
+    std::vector<int> types;
+    std::vector<int> sizes;
+    Expr expr = GetRef<Expr>(op);
+    // Expressions without device annotation default to device type 0.
+    const int device_type =
+        node_device_map_.count(expr) ? static_cast<int>(node_device_map_[expr]->value) : 0;
+
+    auto allocate = [&](const Type& type) {
+      const auto* ttype = type.as<TensorTypeNode>();
+      ICHECK(ttype);
+      sids.push_back(sid_++);
+      types.push_back(device_type);
+      // StorageMap stores sizes as int.
+      sizes.push_back(static_cast<int>(GetMemorySize(ttype)));
+    };
+
+    if (const auto* tuple_type = op->checked_type().as<TupleTypeNode>()) {
+      for (Type t : tuple_type->fields) {
+        allocate(t);
+      }
+    } else {
+      allocate(op->checked_type());
+    }
+    storage_device_map_[expr] = {sids, types, sizes};
+  }
+
+  /*! \brief Storage information of every expression visited so far. */
+  StorageMap storage_device_map_;
+  /*! \brief Device type annotation of each expression, as returned by CollectDeviceInfo. */
+  Map<Expr, Integer> node_device_map_;
+  /*! \brief Next storage id to hand out. */
+  int sid_{0};
+};
 
 class AotReturnSidVisitor : public ExprVisitor {
  public:
-  explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> storage_device_map)
+  explicit AotReturnSidVisitor(StorageMap storage_device_map)
       : storage_device_map_{storage_device_map}, return_sid_{-1} {}
 
-  IntegerArray FindReturnSid(Function func) {
+  std::vector<int> FindReturnSid(Function func) {
     VisitExpr(func->body);
     return return_sid_;
   }
@@ -88,8 +247,8 @@ class AotReturnSidVisitor : public ExprVisitor {
   }
 
  private:
-  Map<Expr, Array<IntegerArray>> storage_device_map_;
-  IntegerArray return_sid_;
+  StorageMap storage_device_map_;
+  std::vector<int> return_sid_;
 };
 
 /*! \brief Code generator for AOT executor */
@@ -120,14 +279,14 @@ class AOTExecutorCodegen : public ExprVisitor {
    * \brief Return a vector of variables that represents the sids for the given Relay Expr
    */
   std::vector<tir::Var> PackSid(Expr expr) {
-    Array<IntegerArray> sids = storage_device_map_[expr];
+    auto sids = storage_device_map_[expr];
     std::vector<tir::Var> sid_vars;
 
     // Note that an expression can have multiple sids associated with it
     // e.g., returning multiple values from a function
     for (const auto& sid : sids[0]) {
       // Determine if an sid is an output buffer
-      int sid_int = static_cast<int>((sid.as<IntImmNode>())->value);
+      int sid_int = sid;
       auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid_int);
       if (output_iter != return_sid_.end()) {
         int output_index = std::distance(return_sid_.begin(), output_iter);
@@ -391,7 +550,7 @@ class AOTExecutorCodegen : public ExprVisitor {
 
     ICHECK_GE(storage_device_map_.count(expr), 0);
     auto& device_type = storage_device_map_[expr][1];
-    auto call_dev_type = device_type[0]->value;
+    auto call_dev_type = device_type[0];
     // Normal Relay Function
     if (targets_.size() == 1) {
       // homogeneous execution.
@@ -428,10 +587,9 @@ class AOTExecutorCodegen : public ExprVisitor {
 
     // If the Var node is an output node we need to copy the content of the variable to the output
     // It's safe to check the SID here because Var StorageToken are never reallocated
-    Array<IntegerArray> sids = storage_device_map_[expr];
+    auto sids = storage_device_map_[expr];
 
-    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
-                                 static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sids[0][0]);
     if (output_iter != return_sid_.end()) {
       int output_index = std::distance(return_sid_.begin(), output_iter);
       auto var_expr = FindExpr(expr);
@@ -444,15 +602,14 @@ class AOTExecutorCodegen : public ExprVisitor {
     size_t index = params_.size();
     std::string name = "p" + std::to_string(index);
 
-    param_storage_ids_[name] = storage_device_map_[expr][0][0]->value;
+    param_storage_ids_[name] = storage_device_map_[expr][0][0];
     params_[name] = op->data;
     params_by_expr_.Set(expr, name);
 
     // If the Constant node is an output node we need to copy the content of the parameter to the
     // output A Var node can only produce a single output
-    Array<IntegerArray> sids = storage_device_map_[expr];
-    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
-                                 static_cast<int>((sids[0][0].as<IntImmNode>())->value));
+    auto sids = storage_device_map_[expr];
+    auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sids[0][0]);
     if (output_iter != return_sid_.end()) {
       int output_index = std::distance(return_sid_.begin(), output_iter);
       CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr), sids[2][0]);
@@ -513,7 +670,7 @@ class AOTExecutorCodegen : public ExprVisitor {
 
       for (unsigned int i = 0; i < kv.second[0].size(); i++) {
         int size = kv.second[2][i];
-        int sid = static_cast<int>((kv.second[0][i].as<IntImmNode>())->value);
+        int sid = kv.second[0][i];
 
         if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) {
           continue;
@@ -523,6 +680,8 @@ class AOTExecutorCodegen : public ExprVisitor {
         // so we don't pay the price of allocation for every inference
         if (!allocated[sid]) {
           body = tir::Allocate(sids_table_[sid], DataType::Int(8), {size}, tir::const_true(), body);
+          body = tir::AttrStmt(sids_table_[sid], tir::attr::storage_scope, tir::StringImm("global"),
+                               body);
         }
         allocated[sid] = true;
       }
@@ -566,7 +725,8 @@ class AOTExecutorCodegen : public ExprVisitor {
   std::unordered_map<std::string, int64_t> param_storage_ids_;
 
   /*! \brief plan memory of device result */
-  Map<Expr, Array<IntegerArray>> storage_device_map_;
+  StorageMap storage_device_map_;
+  /*! \brief mapping sid -> tir::Var */
   std::unordered_map<int, te::Var> sids_table_;
   /*! \brief lowered funcs */
   std::unordered_map<std::string, IRModule> lowered_funcs_;
@@ -577,7 +737,7 @@ class AOTExecutorCodegen : public ExprVisitor {
   /*! \brief the set of statements that make the program */
   std::vector<tir::Stmt> stmts_;
   /*! \brief the list of return sids (note that the function might return more then one output */
-  IntegerArray return_sid_;
+  std::vector<int> return_sid_;
 
  public:
   AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host)
@@ -588,9 +748,7 @@ class AOTExecutorCodegen : public ExprVisitor {
   }
 
   LoweredOutput Codegen(relay::Function func) {
-    // Get the module, storage map and token sizes
-    auto pf = GetPackedFunc("relay.backend.GraphPlanMemory");
-    storage_device_map_ = (*pf)(func);
+    storage_device_map_ = AOTOnDemandAllocator().Run(func);
 
     int input_index = 0;
     for (auto input : func->params) {
@@ -635,14 +793,21 @@ class AOTExecutorCodegen : public ExprVisitor {
     }
     ret.external_mods = compile_engine_->LowerExternalFunctions();
 
+    // Build the TIR IRModule
+    Map<GlobalVar, BaseFunc> symbol_map;
+    symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func);
+    IRModule mod_run(symbol_map);
+
+    // Apply storage rewrite pass to the runner function to do memory planning
+    auto storage_rewrite = tir::transform::StorageRewrite();
+    mod_run = storage_rewrite(mod_run);
+
+    // Update the lowered functions
     auto target_host_str = target_host_->str();
     if (ret.lowered_funcs.find(target_host_str) != ret.lowered_funcs.end()) {
-      ret.lowered_funcs[target_host_str]->Add(
-          GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func);
+      ret.lowered_funcs[target_host_str]->Update(mod_run);
     } else {
-      Map<GlobalVar, BaseFunc> symbol_map;
-      symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func);
-      ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map));
+      ret.lowered_funcs.Set(target_host_str, mod_run);
     }
     ret.function_metadata = std::move(function_metadata_);
     ret.metadata =

diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py
@@ -36,6 +36,46 @@
 from tvm.micro import export_model_library_format
 
 
+def convert_to_relay(
+    tflite_model_buf,
+    input_data,
+    input_node,
+):
+    """ Convert a tflite model buffer in a Relay module """
+
+    def convert_to_list(x):
+        if not isinstance(x, list):
+            x = [x]
+        return x
+
+    # TFLite.Model.Model has changed to TFLite.Model from 1.14 to 2.1
+    try:
+        import tflite.Model
+
+        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
+    except AttributeError:
+        import tflite
+
+        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
+    except ImportError:
+        raise ImportError("The tflite package must be installed")
+
+    input_data = convert_to_list(input_data)
+    input_node = convert_to_list(input_node)
+
+    shape_dict = {}
+    dtype_dict = {}
+    for i, e in enumerate(input_node):
+        shape_dict[e] = input_data[i].shape
+        dtype_dict[e] = input_data[i].dtype.name
+
+    mod, params = relay.frontend.from_tflite(
+        tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict
+    )
+    mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params)
+    return mod, params
+
+
 def subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout):
     """
     This method runs a process and logs the output to both a log file and stdout

diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
@@ -364,5 +364,25 @@ def test_byoc_utvm(use_calculated_workspaces):
     compile_and_run(mod, input_list, output_list, use_calculated_workspaces)
 
 
+def test_quant_mobilenet_tfl():
+    import tvm.relay.testing.tf as tf_testing
+
+    tflite_model_file = tf_testing.get_workload_official(
+        "https://storage.googleapis.com/download.tensorflow.org/"
+        "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
+        "mobilenet_v1_1.0_224_quant.tflite",
+    )
+    with open(tflite_model_file, "rb") as f:
+        tflite_model_buf = f.read()
+    data_shape = (1, 224, 224, 3)
+    in_min, in_max = (0, 255)
+    data = np.random.randint(in_min, high=in_max, size=data_shape, dtype="uint8")
+    mod, params = convert_to_relay(tflite_model_buf, data, "input")
+    inputs = {"input": data}
+    output_list = generate_ref_data(mod, inputs, params)
+    input_list = [inputs["input"]]
+    compile_and_run(mod, input_list, output_list, True, params)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])