Skip to content

Commit

Permalink
Decoupling AOT from graph memory planner
Browse files Browse the repository at this point in the history
In this PR we decouple AOT from the Graph Memory Planner (GMP). Since
AOT has the runner expressed in TIR, we can get rid of the GMP in Relay
and use the StorageRewrite pass to do memory planning on the runner
function. This also resolves the issue mentioned in apache#8062.

Change-Id: I6e33fadbf0462edf0366ee37e84ffde26123d3cb
  • Loading branch information
Giuseppe Rossini committed May 20, 2021
1 parent dbd076a commit d7ecb6d
Show file tree
Hide file tree
Showing 3 changed files with 250 additions and 25 deletions.
215 changes: 190 additions & 25 deletions src/relay/backend/aot_executor_codegen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <tvm/tir/expr.h>
#include <tvm/tir/function.h>
#include <tvm/tir/stmt.h>
#include <tvm/tir/transform.h>

#include <algorithm>
#include <list>
Expand All @@ -46,13 +47,171 @@ namespace backend {

using IntegerArray = Array<Integer>;
using TargetsMap = std::unordered_map<int, Target>;
using StorageMap = std::unordered_map<Expr, std::vector<std::vector<int>>, runtime::ObjectPtrHash,
runtime::ObjectPtrEqual>;
/**
* This is an on demand allocator for AOT. A new temporary
* (storage allocator identifier) is allocated for each operation.
*/
class AOTOnDemandAllocator : public ExprVisitor {
public:
// run the visitor on a function.
StorageMap Run(const Function& func) {
node_device_map_ = CollectDeviceInfo(func);

for (Expr param : func->params) {
CreateSid(param.operator->());
}

GetSid(func->body);
return storage_device_map_;
}

void VisitExpr_(const ConstantNode* op) final { CreateSid(op); }

void VisitExpr_(const CallNode* op) final {
// create token for the call node.
CreateSid(op);
for (Expr arg : op->args) {
GetSid(arg);
}
}

void VisitExpr_(const VarNode* op) final {
// Do nothing.
}

void VisitExpr_(const FunctionNode* op) final {
// do not recurse into sub function.
}

void VisitExpr_(const GlobalVarNode* op) final {
// Do nothing.
}

void VisitExpr_(const OpNode* op) final {
// Do nothing.
}

void VisitExpr_(const TupleNode* op) final {
std::vector<int> field_ids;
std::vector<int> field_sizes;
std::vector<int> field_types;
Expr expr = GetRef<Expr>(op);
for (Expr field : op->fields) {
auto sids = GetSid(field);
field_ids.insert(field_ids.end(), sids[0].begin(), sids[0].end());
field_types.insert(field_types.end(), sids[1].begin(), sids[1].end());
field_sizes.insert(field_sizes.end(), sids[2].begin(), sids[2].end());
}
if (storage_device_map_[expr].empty()) {
InitStorage(expr);
}
storage_device_map_[expr][0] = field_ids;
storage_device_map_[expr][1] = field_sizes;
storage_device_map_[expr][2] = field_types;
}

void VisitExpr_(const TupleGetItemNode* op) final {
Expr expr = GetRef<Expr>(op);
const auto& sids = GetSid(op->tuple);
ICHECK_LT(static_cast<size_t>(op->index), sids.size());
if (storage_device_map_[expr].empty()) {
InitStorage(expr);
}
storage_device_map_[expr][0] = {sids[0][op->index]};
storage_device_map_[expr][1] = {sids[1][op->index]};
storage_device_map_[expr][2] = {sids[2][op->index]};
}

void VisitExpr_(const IfNode* op) final { LOG(FATAL) << "if is not supported."; }

void VisitExpr_(const LetNode* op) final { LOG(FATAL) << "if is not supported."; }

private:
/*!
* \brief ceil(size/word_size) to get number of words.
* \param size The original size.
* \param word_size The element size.
*/
static size_t DivRoundUp(size_t size, size_t word_size) {
return (size + word_size - 1) / word_size;
}
/*!
* \brief Get the memory requirement.
* \param prototype The prototype token.
* \return The required memory size.
*/
size_t GetMemorySize(const TensorTypeNode* ttype) {
ICHECK(ttype != nullptr);
size_t size = 1;
for (IndexExpr dim : ttype->shape) {
const int64_t* pval = tir::as_const_int(dim);
ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape;
ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval;
size *= static_cast<size_t>(pval[0]);
}
size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
return size;
}
/*!
* \brief Get the necessary token.
* \param expr The expression.
* \return The corresponding token.
*/
std::vector<std::vector<int>> GetSid(const Expr& expr) {
this->VisitExpr(expr);
auto it = storage_device_map_.find(expr);
ICHECK(it != storage_device_map_.end());
return it->second;
}

void CreateSid(const ExprNode* op) {
std::vector<int> sids;
std::vector<int> sizes;
std::vector<int> types;
Expr expr = GetRef<Expr>(op);
int device_type = node_device_map_.count(GetRef<Expr>(op)) ? node_device_map_[expr]->value : 0;
if (const auto* tuple_type = op->checked_type().as<TupleTypeNode>()) {
for (Type t : tuple_type->fields) {
const auto* ttype = t.as<TensorTypeNode>();
ICHECK(ttype);
sids.push_back(sid_++);
types.push_back(device_type);
sizes.push_back(GetMemorySize(ttype));
}
} else {
const auto* ttype = op->checked_type().as<TensorTypeNode>();
ICHECK(ttype);
sids.push_back(sid_++);
types.push_back(device_type);
sizes.push_back(GetMemorySize(ttype));
}
InitStorage(expr);
storage_device_map_[expr][0] = sids;
storage_device_map_[expr][1] = types;
storage_device_map_[expr][2] = sizes;
}

void InitStorage(Expr expr) {
if (storage_device_map_[expr].empty()) {
storage_device_map_[expr].push_back(std::vector<int>());
storage_device_map_[expr].push_back(std::vector<int>());
storage_device_map_[expr].push_back(std::vector<int>());
}
}

StorageMap storage_device_map_;
Map<Expr, Integer> node_device_map_;
int sid_{0};
};

class AotReturnSidVisitor : public ExprVisitor {
public:
explicit AotReturnSidVisitor(Map<Expr, Array<IntegerArray>> storage_device_map)
explicit AotReturnSidVisitor(StorageMap storage_device_map)
: storage_device_map_{storage_device_map}, return_sid_{-1} {}

IntegerArray FindReturnSid(Function func) {
std::vector<int> FindReturnSid(Function func) {
VisitExpr(func->body);
return return_sid_;
}
Expand Down Expand Up @@ -88,8 +247,8 @@ class AotReturnSidVisitor : public ExprVisitor {
}

private:
Map<Expr, Array<IntegerArray>> storage_device_map_;
IntegerArray return_sid_;
StorageMap storage_device_map_;
std::vector<int> return_sid_;
};

/*! \brief Code generator for AOT executor */
Expand Down Expand Up @@ -120,14 +279,14 @@ class AOTExecutorCodegen : public ExprVisitor {
* \brief Return a vector of variables that represents the sids for the given Relay Expr
*/
std::vector<tir::Var> PackSid(Expr expr) {
Array<IntegerArray> sids = storage_device_map_[expr];
auto sids = storage_device_map_[expr];
std::vector<tir::Var> sid_vars;

// Note that an expression can have multiple sids associated with it
// e.g., returning multiple values from a function
for (const auto& sid : sids[0]) {
// Determine if an sid is an output buffer
int sid_int = static_cast<int>((sid.as<IntImmNode>())->value);
int sid_int = sid;
auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sid_int);
if (output_iter != return_sid_.end()) {
int output_index = std::distance(return_sid_.begin(), output_iter);
Expand Down Expand Up @@ -391,7 +550,7 @@ class AOTExecutorCodegen : public ExprVisitor {

ICHECK_GE(storage_device_map_.count(expr), 0);
auto& device_type = storage_device_map_[expr][1];
auto call_dev_type = device_type[0]->value;
auto call_dev_type = device_type[0];
// Normal Relay Function
if (targets_.size() == 1) {
// homogeneous execution.
Expand Down Expand Up @@ -428,10 +587,9 @@ class AOTExecutorCodegen : public ExprVisitor {

// If the Var node is an output node we need to copy the content of the variable to the output
// It's safe to check the SID here because Var StorageToken are never reallocated
Array<IntegerArray> sids = storage_device_map_[expr];
auto sids = storage_device_map_[expr];

auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sids[0][0]);
if (output_iter != return_sid_.end()) {
int output_index = std::distance(return_sid_.begin(), output_iter);
auto var_expr = FindExpr(expr);
Expand All @@ -444,15 +602,14 @@ class AOTExecutorCodegen : public ExprVisitor {
size_t index = params_.size();
std::string name = "p" + std::to_string(index);

param_storage_ids_[name] = storage_device_map_[expr][0][0]->value;
param_storage_ids_[name] = storage_device_map_[expr][0][0];
params_[name] = op->data;
params_by_expr_.Set(expr, name);

// If the Constant node is an output node we need to copy the content of the parameter to the
// output A Var node can only produce a single output
Array<IntegerArray> sids = storage_device_map_[expr];
auto output_iter = std::find(return_sid_.begin(), return_sid_.end(),
static_cast<int>((sids[0][0].as<IntImmNode>())->value));
auto sids = storage_device_map_[expr];
auto output_iter = std::find(return_sid_.begin(), return_sid_.end(), sids[0][0]);
if (output_iter != return_sid_.end()) {
int output_index = std::distance(return_sid_.begin(), output_iter);
CopyToOutput(main_signature_[input_vars_.size() + output_index], PackParam(expr), sids[2][0]);
Expand Down Expand Up @@ -513,7 +670,7 @@ class AOTExecutorCodegen : public ExprVisitor {

for (unsigned int i = 0; i < kv.second[0].size(); i++) {
int size = kv.second[2][i];
int sid = static_cast<int>((kv.second[0][i].as<IntImmNode>())->value);
int sid = kv.second[0][i];

if (std::find(return_sid_.begin(), return_sid_.end(), sid) != return_sid_.end()) {
continue;
Expand All @@ -523,6 +680,8 @@ class AOTExecutorCodegen : public ExprVisitor {
// so we don't pay the price of allocation for every inference
if (!allocated[sid]) {
body = tir::Allocate(sids_table_[sid], DataType::Int(8), {size}, tir::const_true(), body);
body = tir::AttrStmt(sids_table_[sid], tir::attr::storage_scope, tir::StringImm("global"),
body);
}
allocated[sid] = true;
}
Expand Down Expand Up @@ -566,7 +725,8 @@ class AOTExecutorCodegen : public ExprVisitor {
std::unordered_map<std::string, int64_t> param_storage_ids_;

/*! \brief plan memory of device result */
Map<Expr, Array<IntegerArray>> storage_device_map_;
StorageMap storage_device_map_;
/*! \brief mapping sid -> tir::Var */
std::unordered_map<int, te::Var> sids_table_;
/*! \brief lowered funcs */
std::unordered_map<std::string, IRModule> lowered_funcs_;
Expand All @@ -577,7 +737,7 @@ class AOTExecutorCodegen : public ExprVisitor {
/*! \brief the set of statements that make the program */
std::vector<tir::Stmt> stmts_;
/*! \brief the list of return sids (note that the function might return more than one output) */
IntegerArray return_sid_;
std::vector<int> return_sid_;

public:
AOTExecutorCodegen(runtime::Module* mod, const TargetsMap& targets, Target target_host)
Expand All @@ -588,9 +748,7 @@ class AOTExecutorCodegen : public ExprVisitor {
}

LoweredOutput Codegen(relay::Function func) {
// Get the module, storage map and token sizes
auto pf = GetPackedFunc("relay.backend.GraphPlanMemory");
storage_device_map_ = (*pf)(func);
storage_device_map_ = AOTOnDemandAllocator().Run(func);

int input_index = 0;
for (auto input : func->params) {
Expand Down Expand Up @@ -635,14 +793,21 @@ class AOTExecutorCodegen : public ExprVisitor {
}
ret.external_mods = compile_engine_->LowerExternalFunctions();

// Build the TIR IRModule
Map<GlobalVar, BaseFunc> symbol_map;
symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func);
IRModule mod_run(symbol_map);

// Apply storage rewrite pass to the runner function to do memory planning
auto storage_rewrite = tir::transform::StorageRewrite();
mod_run = storage_rewrite(mod_run);

// Update the lowered functions
auto target_host_str = target_host_->str();
if (ret.lowered_funcs.find(target_host_str) != ret.lowered_funcs.end()) {
ret.lowered_funcs[target_host_str]->Add(
GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func);
ret.lowered_funcs[target_host_str]->Update(mod_run);
} else {
Map<GlobalVar, BaseFunc> symbol_map;
symbol_map.Set(GlobalVar(::tvm::runtime::symbol::tvm_run_func_prefix), prim_func);
ret.lowered_funcs.Set(target_host_str, IRModule(symbol_map));
ret.lowered_funcs.Set(target_host_str, mod_run);
}
ret.function_metadata = std::move(function_metadata_);
ret.metadata =
Expand Down
40 changes: 40 additions & 0 deletions tests/python/relay/aot/aot_test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,46 @@
from tvm.micro import export_model_library_format


def convert_to_relay(
    tflite_model_buf,
    input_data,
    input_node,
):
    """Import a serialized TFLite model as a Relay module.

    Parameters
    ----------
    tflite_model_buf :
        The raw content of the ``.tflite`` file.
    input_data :
        One input array, or a list of them. Only ``.shape`` and ``.dtype.name``
        of each array are read.
    input_node :
        The input name, or a list of names, matching ``input_data`` by position.

    Returns
    -------
    mod, params :
        The Relay module, with ``params`` already bound into ``main``, and the
        parameters returned by the TFLite frontend.
    """

    def as_list(value):
        return value if isinstance(value, list) else [value]

    # The root class is reachable as tflite.Model.Model or as tflite.Model
    # depending on the tflite package version (changed between 1.14 and 2.1).
    try:
        import tflite.Model

        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
    except AttributeError:
        import tflite

        tflite_model = tflite.Model.GetRootAsModel(tflite_model_buf, 0)
    except ImportError:
        raise ImportError("The tflite package must be installed")

    tensors = as_list(input_data)
    names = as_list(input_node)

    # Describe each named input to the frontend by shape and dtype.
    shape_dict = {}
    dtype_dict = {}
    for position, name in enumerate(names):
        tensor = tensors[position]
        shape_dict[name] = tensor.shape
        dtype_dict[name] = tensor.dtype.name

    mod, params = relay.frontend.from_tflite(
        tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict
    )
    # Fold the weights into main so callers get a self-contained function.
    mod["main"] = relay.build_module.bind_params_by_name(mod["main"], params)
    return mod, params


def subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout):
"""
This method runs a process and logs the output to both a log file and stdout
Expand Down
20 changes: 20 additions & 0 deletions tests/python/relay/aot/test_crt_aot.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,5 +364,25 @@ def test_byoc_utvm(use_calculated_workspaces):
compile_and_run(mod, input_list, output_list, use_calculated_workspaces)


def test_quant_mobilenet_tfl():
    """Run the quantized MobileNet v1 TFLite model through the AOT flow.

    The model is fetched with ``get_workload_official``, imported into Relay,
    and the AOT output is checked against the reference data for a random
    uint8 input.
    """
    import tvm.relay.testing.tf as tf_testing

    model_path = tf_testing.get_workload_official(
        "https://storage.googleapis.com/download.tensorflow.org/"
        "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
        "mobilenet_v1_1.0_224_quant.tflite",
    )
    with open(model_path, "rb") as model_file:
        model_bytes = model_file.read()

    # Random uint8 image in NHWC-like shape (1, 224, 224, 3).
    low, high = (0, 255)
    image = np.random.randint(low, high=high, size=(1, 224, 224, 3), dtype="uint8")

    mod, params = convert_to_relay(model_bytes, image, "input")
    feed = {"input": image}
    expected = generate_ref_data(mod, feed, params)
    compile_and_run(mod, [feed["input"]], expected, True, params)


if __name__ == "__main__":
pytest.main([__file__])

0 comments on commit d7ecb6d

Please sign in to comment.