Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[microNPU] Tweak a layout transform matrix #10763

Merged
merged 3 commits into from
Mar 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 50 additions & 41 deletions python/tvm/contrib/ethosu/cascader/device_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,23 @@ def is_partkernel(

return part_kernel_first_utilization > depth_first_utilization or ifm_channels <= 8

def _get_input_banks(self, input_block_shape, input_bytewidth):
    """Return the number of SHRAM banks required to hold one input block.

    The block's byte size is its area times the 8-byte-aligned depth row,
    converted to banks and rounded up to the input bank granularity.
    NOTE(review): the factor of two presumably accounts for double
    buffering of the input — confirm against the device model.
    """
    aligned_row_bytes = self._align(input_block_shape.depth * input_bytewidth, 8)
    total_bytes = input_block_shape.area() * aligned_row_bytes
    banks = 2 * _round_up_div(total_bytes, self._bank_size_bytes)
    return _round_up(banks, self._input_granularity)

def _get_accumulator_banks(self, output_block_shape, acc_bytewidth, depth):
    """Return the number of SHRAM banks required for the accumulators.

    The accumulated depth is the smaller of the output block depth and
    ``depth``, rounded up to a multiple of 8; the byte total is converted
    to banks and rounded up to the accumulator granularity for this
    accumulator byte width.
    NOTE(review): the factor of two mirrors the input-bank calculation —
    presumably double buffering; confirm.
    """
    effective_depth = _round_up(min(output_block_shape.depth, depth), 8)
    total_bytes = output_block_shape.area() * self._align(effective_depth, 8) * acc_bytewidth
    banks = 2 * _round_up_div(total_bytes, self._bank_size_bytes)
    return _round_up(banks, self._accumulator_granularity[acc_bytewidth])

def get_elementwise_block_config(
self,
ifm_propagator: Propagator,
Expand Down Expand Up @@ -533,16 +550,9 @@ def get_elementwise_block_config(
input2_block.round_up(self._input_micro_block)

# Banks required for input block
input_bytes = input_block.area() * self._align(input_block.depth * input_bytewidth, 8)
input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
input_banks = _round_up(input_banks, self._input_granularity)

input_banks = self._get_input_banks(input_block, input_bytewidth)
# Banks required for input2 block
input2_bytes = input2_block.area() * self._align(
input2_block.depth * input_bytewidth, 8
)
input2_banks = _round_up_div(input2_bytes, self._bank_size_bytes) * 2
input2_banks = _round_up(input2_banks, self._input_granularity)
input2_banks = self._get_input_banks(input2_block, input_bytewidth)

# Check whether or not both IFMs fit into SHRAM
if (input_banks + input2_banks) <= banks_available:
Expand All @@ -561,6 +571,29 @@ def get_elementwise_block_config(

return block_config

def _get_subkernel_propagator(
    self, op_attrs, ifm_propagator, input_layout, output_layout, depth
):
    """Build a Propagator whose transform is clipped to the subkernel limits.

    The height/width entries of the IFM propagator's transform are capped at
    ``subkernel_limit - stride``; for pooling and depthwise ops producing
    NHCWB16 output, the depth-related entry is overridden with ``depth``.
    Note: the transform list of ``ifm_propagator`` is modified in place, as
    in the original implementation.
    """
    stride_h = int(op_attrs.get("stride_h", 1))
    stride_w = int(op_attrs.get("stride_w", 1))
    transform = ifm_propagator.transform

    height_limit = self._subkernel_limits[0] - stride_h
    width_limit = self._subkernel_limits[1] - stride_w
    # The width entry sits on row 3 for NHCWB16 input, row 2 for NHWC.
    width_row = 3 if input_layout == "NHCWB16" else 2
    transform[1][-1] = min(transform[1][-1], height_limit)
    transform[width_row][-1] = min(transform[width_row][-1], width_limit)

    if op_attrs.get("op") in ("ethosu_pooling", "ethosu_depthwise_conv2d"):
        if output_layout == "NHCWB16" and input_layout == "NHWC":
            transform[3][-1] = depth
        elif output_layout == "NHCWB16" and input_layout == "NHCWB16":
            transform[2][-1] = depth // 16

    return Propagator(transform, ifm_propagator.offset)

def get_valid_block_configs(
self,
ifm_propagator: Propagator,
Expand Down Expand Up @@ -612,33 +645,13 @@ def get_valid_block_configs(
op_type = op_attrs.get("op")
op_str = op_attrs.get("op_str")
activation = op_attrs.get("activation", "NONE")
stride_h = int(op_attrs.get("stride_h", 1))
stride_w = int(op_attrs.get("stride_w", 1))
upscaling_factor = 1 if op_attrs.get("upscale", "NONE") == "NONE" else 2

subkernel_transform = ifm_propagator.transform
if output_layout == "NHCWB16":
output_shape = _Shape([1, ofm_shape[1], ofm_shape[3], ofm_channels])
else:
output_shape = _Shape(ofm_shape)

if input_layout == "NHCWB16":
subkernel_transform[1][-1] = min(
subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
)
subkernel_transform[3][-1] = min(
subkernel_transform[3][-1], self._subkernel_limits[1] - stride_w
)
else:
subkernel_transform[1][-1] = min(
subkernel_transform[1][-1], self._subkernel_limits[0] - stride_h
)
subkernel_transform[2][-1] = min(
subkernel_transform[2][-1], self._subkernel_limits[1] - stride_w
)

subkernel_propagator = Propagator(subkernel_transform, ifm_propagator.offset)

# Define search space
max_height = min(output_shape.height, self._max_block_shape.height)
min_height = max(self._micro_block.height, upscaling_factor)
Expand All @@ -655,7 +668,7 @@ def get_valid_block_configs(
if activation == "LUT" and not self._lut_reserved:
banks_available -= 2

# Input block depth has additional limitations for Operators that require full input depth
# Input block depth has additional limitations for operators that require full input depth
input_block_depth = 0
is_partkernel = self.is_partkernel(op_type, ifm_channels, ifm_dtype, kernel_h * kernel_w)
if op_type == "ethosu_conv2d":
Expand All @@ -669,6 +682,10 @@ def get_valid_block_configs(
# Block depth has to be less than full depth or a multiple of the split depth
continue

subkernel_propagator = self._get_subkernel_propagator(
op_attrs, ifm_propagator, input_layout, output_layout, depth
)

for width in range(min_width, max_width + min_width, min_width):
for height in range(min_height, max_height + min_height, min_height):
if output_layout == "NHCWB16":
Expand Down Expand Up @@ -709,19 +726,11 @@ def get_valid_block_configs(
input_block_shape.depth = input_block_depth

# Banks required for input block
input_bytes = input_block_shape.area() * self._align(
input_block_shape.depth * input_bytewidth, 8
)
input_banks = _round_up_div(input_bytes, self._bank_size_bytes) * 2
input_banks = _round_up(input_banks, self._input_granularity)

input_banks = self._get_input_banks(input_block_shape, input_bytewidth)
# Banks required for accumulation
acc_depth = _round_up(min(output_block_shape.depth, ofm_channels), 8)
acc_bytes = (
output_block_shape.area() * self._align(acc_depth, 8) * acc_bytewidth
acc_banks = self._get_accumulator_banks(
output_block_shape, acc_bytewidth, depth
)
acc_banks = _round_up_div(acc_bytes, self._bank_size_bytes) * 2
acc_banks = _round_up(acc_banks, self._accumulator_granularity[acc_bytewidth])

if (input_banks + acc_banks) <= banks_available:
output_cycles = self._get_output_cycles(
Expand Down
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/binary_elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher

from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def binary_elementwise_compute(
Expand Down Expand Up @@ -196,21 +197,8 @@ def binary_elementwise_compute(
attrs=binary_elementwise_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ifm_channels))

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
Expand Down
60 changes: 60 additions & 0 deletions python/tvm/relay/backend/contrib/ethosu/te/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Common methods for the NPU tensor expressions"""

from typing import Tuple, List


def get_layout_transform_matrices(ofm_channels: int) -> Tuple[List[List[float]], List[List[float]]]:
    """Get the NHWC->NHCWB16 and NHCWB16->NHWC layout transform matrices.

    For information about the supported layouts see
    https://developer.arm.com/documentation/102420/0200/Functional-description/
    Control-and-data-flow/Supported-memory-formats-for-feature-maps

    Parameters
    ----------
    ofm_channels : int
        The number of output channels in a NHWC layout

    Returns
    -------
    nhwc_to_nhcwb16, nhcwb16_to_nhwc : Tuple[List[List[float]], List[List[float]]]
        The layout transformation matrices
    """

    # The value of the last dimension (B16) is always 16.
    nhwc_to_nhcwb16 = [
        [1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0],
        [0, 0, 0, 1 / 16, 0],
        [0, 0, 1, 0, 0],
        [0, 0, 0, 0, 16],
        [0, 0, 0, 0, 1],
    ]

    # When we convert from NHWC to NHCWB16, the new C value is given by
    # (ofm_channels - 1) // 16 + 1, which is a lossy operation, so we need to use
    # the actual value of channels in the transform matrix to accurately recover
    # the C in NHWC when we convert from NHCWB16 to NHWC.
    nhcwb16_to_nhwc = [
        [1, 0, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, ofm_channels],
        [0, 0, 0, 0, 0, 1],
    ]

    return nhwc_to_nhcwb16, nhcwb16_to_nhwc
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/convolution.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher

from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def conv2d_compute(
Expand Down Expand Up @@ -175,21 +176,8 @@ def conv2d_compute(
attrs=conv2d_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(ofm_channels)

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, stride_h, 0, 0, (dilated_kernel_h - stride_h)],
Expand Down
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/depthwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher

from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def depthwise_conv2d_compute(
Expand Down Expand Up @@ -169,21 +170,8 @@ def depthwise_conv2d_compute(
attrs=depthwise_conv2d_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(channels)

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, stride_h, 0, 0, (dilated_kernel_h - stride_h)],
Expand Down
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/pooling.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher

from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def pooling_compute(
Expand Down Expand Up @@ -157,21 +158,8 @@ def pooling_compute(
attrs=pooling_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels))

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, stride_h, 0, 0, (pool_shape_h - stride_h)],
Expand Down
18 changes: 3 additions & 15 deletions python/tvm/relay/backend/contrib/ethosu/te/unary_elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from tvm import te
from tvm.contrib.ethosu.cascader import TESubgraph, EthosuPart, Propagator, register_matcher
from .dma import dma_ofm_compute, dma_ifm_compute
from .common import get_layout_transform_matrices


def unary_elementwise_compute(
Expand Down Expand Up @@ -129,21 +130,8 @@ def clz_imp(inp):
attrs=unary_elementwise_attrs,
)

nhwc_to_nhcwb16 = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
[0, 0, 0, 1 / 16, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 0, 16],
[0, 0, 0, 0, 1],
]
nhcwb16_to_nhwc = [
[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0],
[0, 0, 16, 0, 1, -16],
[0, 0, 0, 0, 0, 1],
]
nhwc_to_nhcwb16, nhcwb16_to_nhwc = get_layout_transform_matrices(int(ofm_channels))

ifm_matrix = [
[1, 0, 0, 0, 0],
[0, 1, 0, 0, 0],
Expand Down
2 changes: 2 additions & 0 deletions src/contrib/ethosu/cascader/block_config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ void BlockConfigNode::VisitAttrs(AttrVisitor* v) {
v->Visit("_input_shape", &tmp_arr);
tmp_arr = make_array(output_shape_);
v->Visit("_output_shape", &tmp_arr);
v->Visit("_compute_cycles", &compute_cycles_);
v->Visit("_output_cycles", &output_cycles_);
}

BlockConfig::BlockConfig(const std::vector<int>& input_shape, const std::vector<int>& output_shape,
Expand Down
Loading