[microNPU] Refactor base address determination to codegen
* Renaming runtime_allocate to be scratch again.
* Docstring adjustments.

Change-Id: Ife8baf97f3dc9348718bd03e62549169a466fc34
manupak committed Jan 19, 2022
1 parent 7bc6c0d commit 8a71a57
Showing 3 changed files with 37 additions and 36 deletions.
34 changes: 18 additions & 16 deletions python/tvm/relay/backend/contrib/ethosu/tir_to_cs_translator.py
@@ -36,15 +36,15 @@ class BufferType(Enum):

constant = auto()
input_or_output = auto()
- runtime_allocate = auto()
+ scratch = auto()
input = auto()
output = auto()
shram = auto()


_REGION_MAP = {
BufferType.constant: 0,
- BufferType.runtime_allocate: 1,
+ BufferType.scratch: 1,
BufferType.input: 3,
BufferType.output: 4,
BufferType.shram: int((1 << 8) | (3 << 0)),
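For orientation, the map above assigns each buffer category the Vela "region" index that the command stream uses; the shram entry's int((1 << 8) | (3 << 0)) evaluates to 259. Below is a minimal standalone sketch of the map and its inverse (the tests at the bottom of this diff build the same kind of inverse map); the input_or_output member is omitted for brevity, and reading the shram value as a bit-packed encoding is an assumption based only on how it is constructed here.

from enum import Enum, auto

class BufferType(Enum):
    constant = auto()
    scratch = auto()
    input = auto()
    output = auto()
    shram = auto()

_REGION_MAP = {
    BufferType.constant: 0,
    BufferType.scratch: 1,
    BufferType.input: 3,
    BufferType.output: 4,
    BufferType.shram: int((1 << 8) | (3 << 0)),  # evaluates to 259
}

# Region indices are unique, so the map inverts losslessly.
_INVERSE_REGION_MAP = {region: btype for btype, region in _REGION_MAP.items()}

assert _INVERSE_REGION_MAP[1] is BufferType.scratch
assert _INVERSE_REGION_MAP[259] is BufferType.shram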
@@ -103,23 +103,23 @@ def translate(tir_module, params):
A hex string of the bytes that includes the concatenated
encoded weights, encoded biases and scales.
base_addresses : List[util.BaseAddress]
- base addresses
+ base addresses to be used by the driver
"""

buffer_info = extract_buffer_info(tir_module, params)
call_extern_list = extract_call_extern_list(tir_module)
_npu_ops = list()
for call_extern in call_extern_list:
_npu_ops.append(translate_ethosu_tir_call_extern(call_extern))
- _npu_ops, constant_data, runtime_allocation_size = assign_addresses(buffer_info, _npu_ops)
+ _npu_ops, constant_data, scratch_size = assign_addresses(buffer_info, _npu_ops)
base_addresses = extract_param_base_addresses(tir_module, buffer_info)
- if runtime_allocation_size > 0:
+ if scratch_size > 0:
base_addresses.append(
util.BaseAddress(
"runtime_allocation",
"scratch",
None,
- _REGION_MAP[BufferType.runtime_allocate],
- runtime_allocation_size,
+ _REGION_MAP[BufferType.scratch],
+ scratch_size,
True,
)
)
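When any intermediate buffers were allocated, translate appends a single base address covering the whole scratch region. A minimal sketch of the record that call builds, assuming util.BaseAddress keeps its positional arguments in order; the field names below are guesses for illustration, and the trailing True is read as a runtime-allocation flag only because the pre-rename identifier was runtime_allocation.

from collections import namedtuple

# Hypothetical stand-in for util.BaseAddress; field names are assumed.
BaseAddress = namedtuple(
    "BaseAddress",
    ["name", "primfunc_param_idx", "region", "size", "is_runtime_allocation"],
)

scratch_size = 4096  # example value: total bytes of all intermediate buffers

scratch_base = BaseAddress(
    name="scratch",
    primfunc_param_idx=None,     # not backed by a PrimFunc parameter
    region=1,                    # _REGION_MAP[BufferType.scratch]
    size=scratch_size,
    is_runtime_allocation=True,  # assumed: driver allocates this region at runtime
)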
@@ -248,7 +248,7 @@ def populate_allocate_buffer_info(stmt):
if storage_scope == "local":
buffer_type = BufferType.shram
else:
- buffer_type = BufferType.runtime_allocate
+ buffer_type = BufferType.scratch
buffer_info[allocate.buffer_var] = BufferInfo(
None,
allocate.extents,
@@ -280,7 +280,7 @@ def assign_addresses(buffer_info, npu_ops):
A list of Vela NpuOps with addresses within scratch and constant buffers
constant_tensor : NDArray
A unified constant data array of uint8 as the constant buffer
- runtime_allocation_size : int
+ scratch_size : int
The size of the scratch tensor.
"""

@@ -327,7 +327,7 @@ def classify_io(buffer):

raise ValueError(f"Unused IO : {buffer} in tir module.")

- runtime_allocation_size = 0
+ scratch_size = 0
constant_hex_data = []
total_constant_len = 0
buffer_addresses = dict()
@@ -352,7 +352,9 @@ def classify_io(buffer):
assert buffer_type in (BufferType.input, BufferType.output)
address = 0
buffer_addresses[_buffer] = (address, buffer_type)
- buffer_info[_buffer] = BufferInfo(None, info.dtype, info.dtype, buffer_type)
+ buffer_info[_buffer] = BufferInfo(
+     values=None, shape=info.dtype, dtype=info.dtype, btype=buffer_type
+ )
elif info.btype == BufferType.shram:
accl_config = util.get_accelerator_config()
arch_config = get_accelerator_arch_config(accl_config)
@@ -363,9 +365,9 @@ def classify_io(buffer):
size_in_bytes = int(dtype_bytes * np.prod(list(info.shape)))
# Every memory address the NPU accesses has to be 16-byte aligned
size_in_bytes = util.round_up(size_in_bytes, 16)
- assert info.btype == BufferType.runtime_allocate
- address = runtime_allocation_size
- runtime_allocation_size += size_in_bytes
+ assert info.btype == BufferType.scratch
+ address = scratch_size
+ scratch_size += size_in_bytes
buffer_addresses[_buffer] = (address, info.btype)

for npu_op in npu_ops:
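Scratch addresses are handed out by a simple bump allocator: each buffer's byte size is rounded up to a 16-byte boundary and the buffer is placed directly after the previous one. A minimal sketch of that arithmetic, assuming util.round_up is the usual round-to-multiple helper:

import numpy as np

def round_up(value: int, multiple: int) -> int:
    # e.g. round_up(100, 16) == 112
    return ((value + multiple - 1) // multiple) * multiple

def bump_allocate(shapes, dtype_bytes=1):
    # Assign 16-byte-aligned offsets the way the loop above does.
    scratch_size = 0
    addresses = []
    for shape in shapes:
        size_in_bytes = int(dtype_bytes * np.prod(list(shape)))
        size_in_bytes = round_up(size_in_bytes, 16)
        addresses.append(scratch_size)
        scratch_size += size_in_bytes
    return addresses, scratch_size

# Two uint8 buffers of 1024 and 2048 elements, matching the test expectations:
addresses, total = bump_allocate([[1024], [2048]])
assert addresses == [0, 1024] and total == 3072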
@@ -382,7 +384,7 @@ def classify_io(buffer):
return (
npu_ops,
constant_data,
- runtime_allocation_size,
+ scratch_size,
)


3 changes: 2 additions & 1 deletion src/relay/backend/contrib/ethosu/utils.h
@@ -36,7 +36,8 @@ namespace ethosu {

/*!
* \brief Base addresses are input pointers to
- * the driver that get accessed by produced
+ * the driver that get accessed by the command stream
+ * using offsets to read/write data.
*/
struct BaseAddressNode : public Object {
/*! \brief The identifier, usually it is the param name of the PrimFunc that gets lowered */
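The comment above pins down the addressing model: the command stream refers to memory as (region, offset) pairs, and the driver resolves each pair against the base pointer it was handed for that region. An illustrative sketch of that resolution; the names here are invented for the example, not the actual driver API.

# Illustrative only: resolve a (region, offset) pair from the command
# stream against the base pointers given to the driver.
def resolve_address(base_pointers: dict, region: int, offset: int) -> int:
    return base_pointers[region] + offset

# e.g. region 1 (scratch) based at 0x6000_0000, byte 1024 within it:
assert resolve_address({1: 0x6000_0000}, region=1, offset=1024) == 0x6000_0400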
36 changes: 17 additions & 19 deletions tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
@@ -230,12 +230,12 @@ def test_buffer_info_extraction():
"ethosu_conv2d_2": (
[1024],
"uint8",
- tir_to_cs_translator.BufferType.runtime_allocate,
+ tir_to_cs_translator.BufferType.scratch,
),
"ethosu_conv2d_3": (
[2048],
"uint8",
- tir_to_cs_translator.BufferType.runtime_allocate,
+ tir_to_cs_translator.BufferType.scratch,
),
},
},
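The expected entries above are (shape, dtype, buffer type) triples; in the translator they live in a BufferInfo record whose keyword arguments are visible in the first file of this diff (values, shape, dtype, btype). A minimal sketch of that container, modelled as a NamedTuple purely for illustration:

from typing import Any, NamedTuple

# Field names taken from the keyword arguments in the translator diff;
# representing BufferInfo as a NamedTuple is an assumption.
class BufferInfo(NamedTuple):
    values: Any   # constant data, or None for non-constant buffers
    shape: Any    # e.g. [1024]
    dtype: str    # e.g. "uint8"
    btype: Any    # a BufferType member, e.g. BufferType.scratch

info = BufferInfo(values=None, shape=[1024], dtype="uint8", btype="scratch")
assert info.dtype == "uint8"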
@@ -776,15 +776,15 @@ def _check_buffer(address, region, length, buffer_var):
original tir buffers.
- If it's constant, this will check
the slice in the constant tensor has the values.
- - If it's runtime_allocation, this will check
-   the slice is within runtime_allocation and does not have conflicts
-   with other runtime_allocation tensors.
+ - If it's scratch, this will check
+   the slice is within scratch and does not have conflicts
+   with other scratch tensors.
- If it's input/output, this will check the
address is zero
"""
inverse_region_map = {
0: tir_to_cs_translator.BufferType.constant,
- 1: tir_to_cs_translator.BufferType.runtime_allocate,
+ 1: tir_to_cs_translator.BufferType.scratch,
3: tir_to_cs_translator.BufferType.input,
4: tir_to_cs_translator.BufferType.output,
}
@@ -804,21 +804,19 @@ def _check_buffer(address, region, length, buffer_var):
constant_tensor_read_mask[address : address + length] = np.ones(
length, dtype=buffer_dtype
)
- elif buffer_type == tir_to_cs_translator.BufferType.runtime_allocate:
+ elif buffer_type == tir_to_cs_translator.BufferType.scratch:
shape = list(buffer_info[buffer_var].shape)
assert length == np.prod(shape)
- assert address < runtime_allocation_size
+ assert address < scratch_size

size_in_bytes = int(np.prod(shape)) * dtype_bytes
# Every buffer is adjusted to align to 16 bytes
size_in_bytes = util.round_up(size_in_bytes, 16)
- assert address + size_in_bytes <= runtime_allocation_size
- # The runtime_allocation area should not be used by anyother buffer
- assert not runtime_allocation_mask[address : address + size_in_bytes].any()
- # The runtime_allocation area is marked as used
- runtime_allocation_mask[address : address + size_in_bytes] = np.ones(
-     size_in_bytes, dtype="uint8"
- )
+ assert address + size_in_bytes <= scratch_size
+ # The scratch area should not be used by any other buffer
+ assert not scratch_mask[address : address + size_in_bytes].any()
+ # The scratch area is marked as used
+ scratch_mask[address : address + size_in_bytes] = np.ones(size_in_bytes, dtype="uint8")
elif buffer_type == tir_to_cs_translator.BufferType.input:
assert address == 0
else:
@@ -898,13 +896,13 @@ def check_buffer(address, region, length, buffer_var):
(
_npu_ops,
constant_hex_string,
- runtime_allocation_size,
+ scratch_size,
) = tir_to_cs_translator.assign_addresses(buffer_info, _npu_ops)
- runtime_allocation_mask = np.zeros(runtime_allocation_size, dtype="uint8")
+ scratch_mask = np.zeros(scratch_size, dtype="uint8")
constant_tensor_read_mask = np.zeros(len(constant_hex_string) // 2, dtype="uint8")
verify(_npu_ops)
- # This will be only 1 if all allocated runtime_allocation is used.
- assert np.prod(runtime_allocation_mask) == 1
+ # This will be only 1 if all allocated scratch is used.
+ assert np.prod(scratch_mask) == 1
# This will be only 1 if all constant tensors are read at least once.
assert np.prod(constant_tensor_read_mask) == 1

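The closing assertions lean on a property of the 0/1 masks: np.prod(mask) is 1 exactly when every byte of the region was claimed by some buffer, and 0 if any byte went unused. (Likewise, the constant payload is a hex string, so its byte length is len(constant_hex_string) // 2, two hex characters per byte.) A quick self-contained check of the mask reasoning:

import numpy as np

mask = np.zeros(16, dtype="uint8")
mask[:] = 1   # every byte claimed by some buffer
assert np.prod(mask) == 1

mask = np.zeros(16, dtype="uint8")
mask[:8] = 1  # half the region never claimed
assert np.prod(mask) == 0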
