Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[onert-micro] Add createBackwardExecutionPlan #13217

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,17 @@ namespace import
struct OMExecutionPlanCreator
{

// Create execution plan for forward graph
static OMStatus createExecutionPlan(core::OMRuntimeStorage &runtime_storage,
core::OMRuntimeContext &runtime_context,
core::memory::OMRuntimeAllocator &allocator,
const OMConfig &configs);

// Create execution plan for backward graph
static OMStatus createBackwardExecutionPlan(core::OMRuntimeStorage &runtime_storage,
core::OMRuntimeContext &runtime_context,
core::memory::OMRuntimeAllocator &allocator,
const OMConfig &configs);
};

} // namespace import
Expand Down
122 changes: 121 additions & 1 deletion onert-micro/onert-micro/src/import/OMExecutionPlanCreator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,37 @@ using namespace onert_micro::core;
using namespace onert_micro::import;
using namespace onert_micro;

namespace
{

// Layers with trainable weights
// Note: needed not to store some layers with const intputs but it is not trainable (for example
// Reshape)
bool isTrainableWeights(const circle::OperatorCode *opcode)
{
switch (opcode->builtin_code())
{
case circle::BuiltinOperator_FULLY_CONNECTED:
case circle::BuiltinOperator_CONV_2D:
return true;
default:
return false;
}
}

} // namespace

/*
* Create execution plan for forward graph
* TODO: describe creation execution plan logic
*/
OMStatus OMExecutionPlanCreator::createExecutionPlan(core::OMRuntimeStorage &runtime_storage,
core::OMRuntimeContext &runtime_context,
core::memory::OMRuntimeAllocator &allocator,
const OMConfig &configs)
{
bool keep_input = configs.keep_input;
bool train_mode = configs.train_mode;

std::vector<std::vector<uint16_t>> &alloc_plan = allocator.getAllocPlan();
std::vector<std::vector<uint16_t>> &dealloc_plan = allocator.getDeallocPlan();
Expand All @@ -40,6 +65,10 @@ OMStatus OMExecutionPlanCreator::createExecutionPlan(core::OMRuntimeStorage &run

const size_t num_kernels = operators->size();

uint32_t num_train_layers = configs.training_context.num_of_train_layers;
if (train_mode and num_train_layers == 0)
num_train_layers = num_kernels;

if (not keep_input)
{
auto graph_inputs = runtime_context.getCircleInputs();
Expand Down Expand Up @@ -70,7 +99,7 @@ OMStatus OMExecutionPlanCreator::createExecutionPlan(core::OMRuntimeStorage &run

if (lifetimes.count(input_index) > 0)
{
if (kernel_type == Inplace)
if (kernel_type == Inplace or train_mode and index >= (num_kernels - num_train_layers))
lifetimes.at(input_index).second = -1;
else
lifetimes.at(input_index).second = index;
Expand All @@ -83,6 +112,8 @@ OMStatus OMExecutionPlanCreator::createExecutionPlan(core::OMRuntimeStorage &run

if (kernel_type == Inplace)
lifetimes[output_index] = Lifetime(-1, index);
else if (train_mode and index >= (num_kernels - num_train_layers))
lifetimes[output_index] = Lifetime(index, -1);
else
lifetimes[output_index] = Lifetime(index, index);
}
Expand All @@ -107,3 +138,92 @@ OMStatus OMExecutionPlanCreator::createExecutionPlan(core::OMRuntimeStorage &run

return Ok;
}

/*
* Create execution plan for backward graph:
* - Allocate memory for inputs for current op using the following rules:
* 1) Don't allocate data for non const input tensor if it is last op for training (don't need to
* backpropagate result) 2) Don't allocate data for const input tensor if is non trainable const 3)
* Allocate data otherwise
* - Deallocate memory for outputs for current op using the following rules:
* 1) Deallocate all outputs tensors.
*/
OMStatus OMExecutionPlanCreator::createBackwardExecutionPlan(
core::OMRuntimeStorage &runtime_storage, core::OMRuntimeContext &runtime_context,
core::memory::OMRuntimeAllocator &allocator, const OMConfig &configs)
{
bool keep_input = configs.keep_input;
bool train_mode = configs.train_mode;
assert(train_mode);
if (train_mode == false)
return UnknownError;

std::vector<std::vector<uint16_t>> &alloc_plan = allocator.getAllocPlan();
std::vector<std::vector<uint16_t>> &dealloc_plan = allocator.getDeallocPlan();

using Lifetime = std::pair<int32_t, int32_t>;
std::map<uint16_t, Lifetime> lifetimes;

const reader::CircleOperators *operators = runtime_context.getCircleOperators();
const uint32_t num_kernels = operators->size();

uint32_t num_train_layers = configs.training_context.num_of_train_layers == 0
? num_kernels
: configs.training_context.num_of_train_layers;
auto graph_outputs = runtime_context.getCircleOutputs();

for (const auto output_ind : *graph_outputs)
{
assert(lifetimes.count(output_ind) == 0);
lifetimes[output_ind] = Lifetime(-1, 0);
}

uint32_t last_node_pos = std::min(num_kernels, num_train_layers);
const auto *op_codes = runtime_context.getCircleOpcodes();
for (int32_t index = 0; index < last_node_pos; ++index)
{
uint32_t cur_op_index = num_kernels - index - 1;
auto *cur_op = operators->operator[](cur_op_index);

uint32_t cur_opcode_index = cur_op->opcode_index();

assert(cur_opcode_index < op_codes->size());

const auto opcode = op_codes->operator[](cur_opcode_index);

const auto *op_inputs = cur_op->inputs();
const auto *op_outputs = cur_op->outputs();
for (int32_t j = 0; j < op_inputs->size(); ++j)
{
const auto input_index = op_inputs->operator[](j);
const auto is_const = runtime_context.isConstTensor(input_index);
// Note: we dont need to allocate for last node and for empty tensor
if (input_index == -1 or (is_const and not isTrainableWeights(opcode)) or
((index == last_node_pos - 1) and !is_const))
{
continue;
}
lifetimes[input_index] = {index, -1};
}

for (int32_t j = 0; j < op_outputs->size(); ++j)
{
const auto output_index = op_outputs->operator[](j);

lifetimes.at(output_index).second = index;
}
}

alloc_plan.assign(last_node_pos, std::vector<uint16_t>());
dealloc_plan.assign(last_node_pos, std::vector<uint16_t>());

for (const auto &item : lifetimes)
{
if (item.second.first != -1)
alloc_plan[item.second.first].push_back(item.first);
if (item.second.second != -1)
dealloc_plan[item.second.second].push_back(item.first);
}

return Ok;
}
Loading