Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[uTVM][AOT] Adding workspace byte alignment #8019

Merged
merged 5 commits into from
May 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/tvm/runtime/crt/error_codes.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ typedef enum {
kTvmErrorPlatformShutdown = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 2),
kTvmErrorPlatformNoMemory = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 3),
kTvmErrorPlatformTimerBadState = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 4),
kTvmErrorPlatformStackAllocBadFree = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryPlatform, 5),

// Common error codes returned from generated functions.
kTvmErrorGeneratedInvalidStorageId = DEFINE_TVM_CRT_ERROR(kTvmErrorCategoryGenerated, 0),
Expand Down
49 changes: 48 additions & 1 deletion include/tvm/runtime/crt/stack_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,61 @@ typedef struct {
size_t workspace_size; // Total number of bytes in the workspace
} tvm_workspace_t;

/*!
* \brief Initialize the stack-based memory manager
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param g_aot_memory The memory buffer used to allocate within
* \param workspace_size The total size of the workspace buffer workspace
*/
tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace,
uint8_t* g_aot_memory, size_t workspace_size);

/*!
* \brief The intended user-facing function to allocate within the buffer. It wraps
* StackMemoryManager_Allocate_Body enable and disable the LIFO check that is useful for debugging
* the AoT codegen.
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param nbytes The number of bytes required for the allocation
* \param current_alloc The pointer-to-pointer to be populated with the allocated address
*/
tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes,
void**);
void** current_alloc);

/*!
* \brief The internal function that accepts allocate inputs and an extra byte to say to enable the
* LIFO check that is useful in debugging for debugging the AoT codegen.
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param nbytes The number of bytes required for the allocation
* \param current_alloc The pointer-to-pointer to be populated with the allocated address
* \param do_lifo_check This being non-zero indicates to perform a check LIFO pattern Allocs/Frees
*/
tvm_crt_error_t StackMemoryManager_Allocate_Body(tvm_workspace_t* tvm_runtime_workspace,
int32_t nbytes, void** current_alloc,
uint8_t do_lifo_check);

/*!
* \brief The intended user-facing function to free the tensor within the buffer. It wraps
* StackMemoryManager_Free_Body enable and disable the stack allocator
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param ptr The base pointer of the tensor to be free'd
*/
tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr);

/*!
* \brief The internal function that accepts free inputs and an extra byte to say to enable the LIFO
* check that is useful in debugging for debugging the AoT codegen.
*
* \param tvm_runtime_workspace The tvm_workspace_t struct containing state
* \param ptr The base pointer of tensor to be free'd within the workspace buffer
* \param do_lifo_check This being non-zero indicates to perform a check LIFO pattern Allocs/Frees
*/
tvm_crt_error_t StackMemoryManager_Free_Body(tvm_workspace_t* tvm_runtime_workspace, void* ptr,
uint8_t do_lifo_check);

#ifdef __cplusplus
} // extern "C"
#endif
Expand Down
4 changes: 4 additions & 0 deletions include/tvm/runtime/device_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ constexpr int kTempAllocaAlignment = 128;
/*! \brief Maximum size that can be allocated on stack */
constexpr int kMaxStackAlloca = 1024;

/*! \brief Number of bytes each allocation must align to by default in the workspace buffer to
* service intermediate tensors */
constexpr int kDefaultWorkspaceAlignment = 1;

/*!
* \brief TVM Runtime Device API, abstracts the device
* specific interface for memory management.
Expand Down
5 changes: 4 additions & 1 deletion include/tvm/tir/analysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,11 @@ TVM_DLL size_t CalculateExprComplexity(const PrimExpr& expr);
/*!
* \brief Calculate the workspace size in bytes needed by the TIR allocates inside the TIR PrimFunc
* \param func The TIR PrimFunc for which the workspace size to be calculated
* \param workspace_byte_alignment The byte alignment required for each tensor allocated in this
* workspace
*/
TVM_DLL size_t CalculateWorkspaceBytes(const PrimFunc& func);
TVM_DLL size_t CalculateWorkspaceBytes(const PrimFunc& func,
const Integer& workspace_byte_alignment);

/*!
* \brief Detect the lowest common ancestor(LCA) of buffer access, including both high-level
Expand Down
6 changes: 4 additions & 2 deletions python/tvm/tir/analysis/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,21 +133,23 @@ def get_block_access_region(block, buffer_var_map):
return _ffi_api.get_block_access_region(block, buffer_var_map)


def calculate_workspace_bytes(func: PrimFunc):
def calculate_workspace_bytes(func: PrimFunc, workspace_byte_alignment: int):
"""Calculate the workspace size in bytes needed by the TIR allocates inside the TIR
PrimFunc.

Parameters
----------
func: tvm.tir.PrimFunc
The function to be detected.
workspace_byte_alignment : int
The byte alignment required for each tensor

Returns
-------
result : int
Workspace size in bytes.
"""
return _ffi_api.calculate_workspace_bytes(func)
return _ffi_api.calculate_workspace_bytes(func, workspace_byte_alignment)


def detect_buffer_access_lca(func: PrimFunc) -> Dict[Buffer, Stmt]:
Expand Down
8 changes: 6 additions & 2 deletions src/relay/backend/aot_executor_codegen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,9 @@ class AOTExecutorCodegen : public ExprVisitor {
* \param func The main function that contains calls to operator tir primitive functions
*/
void UpdateMainWorkspaceSize(const tir::PrimFunc& primfunc, const relay::Function& func) {
Integer workspace_size = CalculateWorkspaceBytes(primfunc);
auto workspace_byte_alignment = target_host_->GetAttr<Integer>("workspace-byte-alignment")
.value_or(tvm::runtime::kDefaultWorkspaceAlignment);
Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment);
// Populate FunctionInfo
auto fi_node = make_object<FunctionInfoNode>();
// Initialize all target workspaces to zero
Expand Down Expand Up @@ -318,7 +320,9 @@ class AOTExecutorCodegen : public ExprVisitor {
auto fi_node = make_object<FunctionInfoNode>();
for (const auto& kv : cfunc->funcs->functions) {
auto primfunc = Downcast<tir::PrimFunc>(kv.second);
Integer workspace_size = CalculateWorkspaceBytes(primfunc);
auto workspace_byte_alignment =
target_host_->GetAttr<Integer>("workspace-byte-alignment").value_or(16);
Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment);
Target primfunc_target = relay_target;
if (primfunc->attrs->dict.count("target")) {
primfunc_target = Downcast<Target>(primfunc->attrs->dict["target"]);
Expand Down
4 changes: 3 additions & 1 deletion src/relay/backend/graph_executor_codegen.cc
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,9 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator<std::vector<
auto fi_node = make_object<FunctionInfoNode>();
for (const auto& kv : cfunc->funcs->functions) {
auto primfunc = Downcast<tir::PrimFunc>(kv.second);
Integer workspace_size = CalculateWorkspaceBytes(primfunc);
auto workspace_byte_alignment = relay_target->GetAttr<Integer>("workspace-byte-alignment")
.value_or(tvm::runtime::kDefaultWorkspaceAlignment);
Integer workspace_size = CalculateWorkspaceBytes(primfunc, workspace_byte_alignment);
Target primfunc_target = relay_target;
if (primfunc->attrs->dict.count("target")) {
primfunc_target = Downcast<Target>(primfunc->attrs->dict["target"]);
Expand Down
3 changes: 0 additions & 3 deletions src/runtime/crt/host/crt_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,6 @@
/*! \brief Maximum length of a PackedFunc function name. */
#define TVM_CRT_MAX_FUNCTION_NAME_LENGTH_BYTES 30

/*! \brief Enable checks to enforce the stack allocator with a FIFO ordering. */
#define TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK

// #define TVM_CRT_FRAMER_ENABLE_LOGS

#endif // TVM_RUNTIME_CRT_HOST_CRT_CONFIG_H_
57 changes: 38 additions & 19 deletions src/runtime/crt/memory/stack_allocator.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,10 @@
*/
// LINT_C_FILE
#include <tvm/runtime/crt/stack_allocator.h>
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK
#include <tvm/runtime/crt/logging.h>
#endif

tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes,
void** current_alloc) {
tvm_crt_error_t StackMemoryManager_Allocate_Body(tvm_workspace_t* tvm_runtime_workspace,
int32_t nbytes, void** current_alloc,
uint8_t do_lifo_check) {
// reserve bytes at the end of the allocation such that
// next_alloc % TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES == 0.
uint32_t offset_bytes =
Expand All @@ -34,30 +32,51 @@ tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspa
}
(*current_alloc) = tvm_runtime_workspace->next_alloc;
uint8_t* next_alloc = tvm_runtime_workspace->next_alloc + nbytes + offset_bytes;
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK
if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) {
return kTvmErrorPlatformNoMemory;
if (do_lifo_check != 0) {
if (next_alloc + STACK_ALLOCATOR_TAG_SIZE_BYTES > workspace_end) {
return kTvmErrorPlatformNoMemory;
}
const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES);
*((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG;
next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES;
}
const uint32_t total_size = (nbytes + offset_bytes + STACK_ALLOCATOR_TAG_SIZE_BYTES);
*((uint32_t*)next_alloc) = total_size ^ STACK_ALLOCATOR_TAG;
next_alloc += STACK_ALLOCATOR_TAG_SIZE_BYTES;
#endif

tvm_runtime_workspace->next_alloc = next_alloc;
return kTvmErrorNoError;
}

tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) {
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_FIFO_CHECK
uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1);
uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr);
uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG;
CHECK_EQ(expected_size, actual_size, "Deallocation not in FIFO ordering");
tvm_crt_error_t StackMemoryManager_Allocate(tvm_workspace_t* tvm_runtime_workspace, int32_t nbytes,
void** current_alloc) {
uint8_t do_lifo_check = 0;
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK
do_lifo_check = 1;
#endif
tvm_runtime_workspace->next_alloc = ptr;
return StackMemoryManager_Allocate_Body(tvm_runtime_workspace, nbytes, current_alloc,
do_lifo_check);
}

tvm_crt_error_t StackMemoryManager_Free_Body(tvm_workspace_t* tvm_runtime_workspace, void* ptr,
uint8_t do_lifo_check) {
if (do_lifo_check != 0) {
uint32_t tag = *(((uint32_t*)tvm_runtime_workspace->next_alloc) - 1);
uint32_t actual_size = (tvm_runtime_workspace->next_alloc - (uint8_t*)ptr);
uint32_t expected_size = tag ^ STACK_ALLOCATOR_TAG;
if (expected_size != actual_size) {
return kTvmErrorPlatformStackAllocBadFree;
}
}
tvm_runtime_workspace->next_alloc = (uint8_t*)ptr;
return kTvmErrorNoError;
}

tvm_crt_error_t StackMemoryManager_Free(tvm_workspace_t* tvm_runtime_workspace, void* ptr) {
uint8_t do_lifo_check = 0;
#ifdef TVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK
do_lifo_check = 1;
#endif
return StackMemoryManager_Free_Body(tvm_runtime_workspace, ptr, do_lifo_check);
}

tvm_crt_error_t StackMemoryManager_Init(tvm_workspace_t* tvm_runtime_workspace,
uint8_t* g_aot_memory, size_t workspace_size) {
tvm_runtime_workspace->next_alloc = g_aot_memory;
Expand Down
1 change: 1 addition & 0 deletions src/target/target_kind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU)
.add_attr_option<String>("mcpu")
.add_attr_option<String>("march")
.add_attr_option<String>("executor")
.add_attr_option<Integer>("workspace-byte-alignment")
.set_default_keys({"cpu"});

TVM_REGISTER_TARGET_KIND("cuda", kDLGPU)
Expand Down
19 changes: 14 additions & 5 deletions src/tir/analysis/calculate_workspace.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
* \brief Calculate any intermediary memory required by PrimFuncs.
*/
#include <tvm/arith/analyzer.h>
#include <tvm/runtime/device_api.h>
#include <tvm/tir/analysis.h>
#include <tvm/tir/function.h>
#include <tvm/tir/stmt_functor.h>
Expand All @@ -33,10 +34,12 @@ class WorkspaceCalculator : public StmtExprVisitor {
public:
WorkspaceCalculator() = default;
size_t operator()(const PrimFunc& func);
size_t byte_alignment = tvm::runtime::kDefaultWorkspaceAlignment;

private:
void VisitStmt_(const AllocateNode* op) override;
size_t CalculateExtentsSize(const AllocateNode* op);
size_t GetByteAlignedSize(size_t non_aligned_size);
size_t current_size = 0;
size_t max_size = 0;
};
Expand All @@ -46,6 +49,10 @@ size_t WorkspaceCalculator::operator()(const PrimFunc& func) {
return this->max_size;
}

size_t WorkspaceCalculator::GetByteAlignedSize(size_t non_aligned_size) {
return ((non_aligned_size + byte_alignment - 1) / byte_alignment) * byte_alignment;
areusch marked this conversation as resolved.
Show resolved Hide resolved
}

size_t WorkspaceCalculator::CalculateExtentsSize(const AllocateNode* op) {
size_t element_size_bytes = op->dtype.bytes();
size_t num_elements = 1;
Expand All @@ -57,7 +64,7 @@ size_t WorkspaceCalculator::CalculateExtentsSize(const AllocateNode* op) {
num_elements = 0;
}
}
return num_elements * element_size_bytes;
return GetByteAlignedSize(num_elements * element_size_bytes);
}

void WorkspaceCalculator::VisitStmt_(const AllocateNode* op) {
Expand All @@ -70,14 +77,16 @@ void WorkspaceCalculator::VisitStmt_(const AllocateNode* op) {
current_size -= size;
}

size_t CalculateWorkspaceBytes(const PrimFunc& func) {
size_t CalculateWorkspaceBytes(const PrimFunc& func, const Integer& workspace_byte_alignment) {
WorkspaceCalculator wc;
wc.byte_alignment = workspace_byte_alignment->value;
return wc(func);
}

TVM_REGISTER_GLOBAL("tir.analysis.calculate_workspace_bytes").set_body_typed([](PrimFunc func) {
return static_cast<int>(CalculateWorkspaceBytes(func));
});
TVM_REGISTER_GLOBAL("tir.analysis.calculate_workspace_bytes")
.set_body_typed([](PrimFunc func, Integer workspace_byte_alignment) {
return static_cast<int>(CalculateWorkspaceBytes(func, workspace_byte_alignment));
});

} // namespace tir
} // namespace tvm
Loading