Skip to content

Commit

Permalink
[HMX] Support different spatial layouts apache#15
Browse files Browse the repository at this point in the history
  • Loading branch information
supersat authored and csullivan committed Feb 21, 2023
1 parent 105930f commit ae49fe4
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 25 deletions.
26 changes: 12 additions & 14 deletions src/runtime/hexagon/hexagon_hmx.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,19 +66,19 @@ extern "C" void device_api_hexagon_hmx_matmul_u8(void* in_a_t, void* in_b_t, int
}

extern "C" void device_api_hexagon_hmx_conv2d_u8(void* in_a_t, void* in_a_next_t, void* in_w_t,
int32_t height, int32_t width,
int32_t a_elem_offset, int32_t a_next_elem_offset,
int32_t w_elem_offset) {
int32_t layout_enc, int32_t enc_filter_size,
int32_t channels, int32_t a_elem_offset,
int32_t a_next_elem_offset,
int32_t w_elem_offset,
int32_t w_size) {
// RT_TRACE_PUT_REC(0, HMX_CONV_BEGIN);
unsigned int aRs, aRt, wRs, wRt;

// Rs, DM:
// 31-11: activation crouton addr
// 10-5: filter/weight size
// 4-0: chan start
aRs = (height - 1) << 8;
aRs |= (width - 1) << 5;

aRs = enc_filter_size;

unsigned int a_addr = reinterpret_cast<unsigned int>(reinterpret_cast<uint8_t*>(in_a_t) + a_elem_offset);
std::bitset<32> a_addr_b(a_addr);
Expand All @@ -94,14 +94,11 @@ extern "C" void device_api_hexagon_hmx_conv2d_u8(void* in_a_t, void* in_a_next_t
// 4-0: chan end
aRt = a_addr_offset & ~0x7ff;
std::bitset<32> a_addr_offset_b(a_addr_offset);
aRt |= 0b111000 << 5;
aRt |= 31; // TODO(nverke): MAKE channel - 1
aRt |= layout_enc;
aRt |= channels - 1;

wRs = reinterpret_cast<unsigned int>(reinterpret_cast<uint8_t*>(in_w_t) + w_elem_offset);
wRt = 32 * 32; // weight size
wRt *= width;
wRt *= height;
wRt -= 1; // weight size
wRt = w_size;

// FARF(LOW, "CallExtern Q6_activation_ub_mxmem_RR_deep_cm(%08x, %08x)", aRs, aRt);
// FARF(LOW, "CallExtern Q6_weight_b_mxmem_RR_deep(%08x, %08x)", wRs, wRt);
Expand All @@ -124,12 +121,13 @@ extern "C" void device_api_hexagon_hmx_matmul_cvt_u8(void* out_o_t, int32_t o_el
Q6_mxmem_AR_after_cm_sat_ub(out_ptr, cRt);
}

extern "C" void device_api_hexagon_hmx_conv2d_cvt_after_u8(void* out_o_t, int32_t elem_offset) {
extern "C" void device_api_hexagon_hmx_conv2d_cvt_after_u8(void* out_o_t, int32_t elem_offset,
int32_t layout_enc) {
// RT_TRACE_PUT_REC(0, HMX_CVT_BEGIN);
// TODO(HWE): Partial stores need extra support in the form of shifting.
unsigned int cRt;
auto out_ptr = reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(out_o_t) + elem_offset);
cRt = 0b111000 << 5; // Spatial coordinates: HHHWWW
cRt = layout_enc;
// FARF(LOW, " Q6_mxmem_AR_after_cm_ub(%08x, %08x)\n", out_ptr, cRt);
Q6_mxmem_AR_after_cm_sat_ub(out_ptr, cRt);
// Q6_mxmem_AR_after_cm_ub(out_ptr, cRt);
Expand Down
2 changes: 2 additions & 0 deletions tests/python/contrib/test_hexagon/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def compose_one_convolution_quantized_separate_layout_transforms(
input_tile_offset,
workload_padding,
strides,
tile_shape_hw,
mem_scope,
allow_deep_mode,
):
Expand All @@ -136,6 +137,7 @@ def compose_one_convolution_quantized_separate_layout_transforms(
tile_offset_A,
workload_padding=workload_padding,
strides=strides,
tile_shape_hw=tile_shape_hw,
mem_scope=mem_scope,
allow_deep_mode=allow_deep_mode
)
Expand Down
57 changes: 48 additions & 9 deletions tests/python/contrib/test_hexagon/hmx_qnn_conv2d_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,34 @@
from tvm import tir
import tvm.script
from tvm.script import tir as T
from math import log2

def get_raw_simple_spatial_mask(TH, TW):
    """Build the raw 6-bit spatial mask for a TH x TW tile.

    The lowest ``6 - int(log2(TH))`` bits of the result are cleared and the
    remaining upper bits (of six) are set. For example an 8x8 tile yields
    ``0b111000``, a 4x16 tile ``0b110000`` and a 16x4 tile ``0b111100``.

    Parameters
    ----------
    TH, TW : int
        Tile height and width. Their product must be exactly 64.

    Returns
    -------
    int
        The un-shifted 6-bit mask.
    """
    tile_area = TH * TW
    assert tile_area == 64, (
        f"Tile spatial size {TH} x {TW} must equal 64."
    )
    # Number of index bits taken by the height; the rest (out of six)
    # is what gets cleared at the low end of the mask.
    height_bits = int(log2(TH))
    cleared_low_bits = 6 - height_bits
    low_ones = (1 << cleared_low_bits) - 1
    return 0b111111 ^ low_ones

def get_enc_filter_size_from_raw_mask(FH, FW, mask):
    """Interleave ``FH - 1`` and ``FW - 1`` into a 6-bit value following ``mask``.

    Walking the six bit positions from least to most significant, a set bit
    in ``mask`` takes the next low-order bit of ``FH - 1`` and a clear bit
    takes the next low-order bit of ``FW - 1``.

    Parameters
    ----------
    FH, FW : int
        Filter height and width, each on the range [1, 8].
    mask : int
        Raw 6-bit spatial mask (set bits select height, clear bits width).

    Returns
    -------
    int
        The un-shifted 6-bit filter-size encoding.

    Raises
    ------
    AssertionError
        If FH or FW is out of range, or if ``FH - 1`` / ``FW - 1`` needs more
        bits than ``mask`` assigns to that dimension. Without this check the
        excess high bits would be dropped silently and a smaller filter size
        would be encoded.
    """
    assert 0 < FH <= 8, f"Filter height {FH} must be on range [1, 8]."
    assert 0 < FW <= 8, f"Filter width {FW} must be on range [1, 8]."

    h_remaining = FH - 1
    w_remaining = FW - 1
    result = 0
    for position in range(6):
        if (mask >> position) & 1:
            bit = h_remaining & 1
            h_remaining >>= 1
        else:
            bit = w_remaining & 1
            w_remaining >>= 1
        result |= bit << position

    # Anything left over did not fit in the bits the mask assigned to that
    # dimension, so the encoding would describe the wrong filter size.
    assert h_remaining == 0 and w_remaining == 0, (
        f"Filter size {FH} x {FW} does not fit the spatial mask {mask:#08b}."
    )
    return result

def convert_spatial_encoding_cm(encoding):
    """Shift a raw spatial encoding left by five bits.

    Parameters
    ----------
    encoding : int
        Raw (un-shifted) encoding value.

    Returns
    -------
    int
        ``encoding`` moved up by five bit positions.
    """
    # NOTE(review): presumably "cm" names the register layout that keeps
    # the low five bits for another field -- confirm against the runtime.
    shifted = encoding << 5
    return shifted

def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
input_shape,
Expand Down Expand Up @@ -88,6 +116,11 @@ def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
TH, TW = tile_shape_hw
tile_shape = [1, TH, TW, 32]

spatial_mask = get_raw_simple_spatial_mask(TH, TW)
spatial_encoding = convert_spatial_encoding_cm(spatial_mask)
filter_size_encoding = get_enc_filter_size_from_raw_mask(FH, FW, spatial_mask)
filter_size_encoding = convert_spatial_encoding_cm(filter_size_encoding)

AC_pad_h_low, AC_pad_w_low = input_tile_offset
assert 0 <= AC_pad_h_low < TH, (
f"Height offset {AC_pad_h_low} must be on range [0, {TH}). "
Expand Down Expand Up @@ -150,9 +183,10 @@ def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
output_tile_offset = (BC_pad_h_low, BC_pad_w_low, BC_pad_h_high, BC_pad_w_high)

CI = 32
WCI = tir.min(tir.ceildiv(C, 4) * 4, CI)
CO = tir.ceildiv(C, 32)
CII = 4
CIO = tir.ceildiv(CI, CII)
CIO = tir.ceildiv(WCI, CII)
AHO = tir.ceildiv(AC_pad_h_low + AH, TH)
AWO = tir.ceildiv(AC_pad_w_low + AW, TW)

Expand All @@ -178,6 +212,7 @@ def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
# tir.IndexMap.AXIS_SEPARATOR,
*filter_chunk_shape,
]
weight_size = (FH * FW * CIO * KI * CII) - 1
transformed_output_shape = [
1,
BHO,
Expand All @@ -193,7 +228,8 @@ def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
N_offset_w = (AC_pad_w_low - PW_low + (FW - 1)) // TW

# The physical hardware allows for 2 such accumulators, which limits
# the size of the filter to FW<=8.
# the size of the filter to FW<=TW.
assert(FW <= TW)
NA = 1 + tir.ceildiv(FW, TW)

accumulator_shape = [
Expand Down Expand Up @@ -297,7 +333,7 @@ def filter_OIHW_to_tiled(F_handle: T.handle, FC_handle: T.handle):
with T.block("filter_OIHW_to_tiled"):
for ko, co, fh, fw, cio, ki, cii in T.grid(KO, CO, FH, FW, CIO, KI, CII):
ci = CII * cio + cii
c = CI * co + ci
c = WCI * co + ci
k = KI * ko + ki
FC[ko, co, fh, fw, cio, ki, cii] = T.if_then_else(
0 <= k and k < K and 0 <= c and c < C,
Expand Down Expand Up @@ -532,7 +568,7 @@ def main(
0:32,
],
zero_crouton[0:TH, 0:TW, 0:32],
FC[vko, vco, 0:FH, 0:FW, 0:8, 0:32, 0:4],
FC[vko, vco, 0:FH, 0:FW, 0:WCI, 0:32, 0:4],
)
# Most reads will occur from AC_croutons[0], but
# overflow in the +h direction will read from
Expand Down Expand Up @@ -565,11 +601,13 @@ def main(
FC[vko, vco, 0, 0, 0, 0, 0],
dtype="handle",
),
FH,
FW,
spatial_encoding,
filter_size_encoding,
WCI,
AC.elem_offset,
AC.elem_offset,
FC.elem_offset,
weight_size,
dtype="handle",
)
# if 0 <= awo and awo < AWO:
Expand All @@ -587,8 +625,8 @@ def main(
bho,
T.max(awoplus + AWO_START - N_offset_w, 0) : T.min(T.max(awoplus + AWO_START + 1 +N_offset_w, 0), BWO),
ko,
0:8,
0:8,
0:TH,
0:TW,
0:32,
],
acc_junkyard[0, 0, 0, 0]
Expand All @@ -606,6 +644,7 @@ def main(
dtype="handle",
),
BC.elem_offset,
spatial_encoding,
dtype="handle"
)
else:
Expand All @@ -614,6 +653,7 @@ def main(
"device_api_hexagon_hmx_conv2d_cvt_after_u8",
T.address_of(acc_junkyard[0, 0, 0, 0], dtype="handle"),
acc_junkyard.elem_offset,
spatial_encoding,
dtype="handle"
)
else:
Expand Down Expand Up @@ -661,7 +701,6 @@ def main(
dtype="handle",
)


return mod, {
"input_tile_offset": input_tile_offset,
"input_shape": input_shape,
Expand Down
6 changes: 6 additions & 0 deletions tests/python/contrib/test_hexagon/infrastructure.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,12 @@ class Conv2dHMXTestingBase:

allow_deep_mode = tvm.testing.parameter(True)

tile_shape_hw = tvm.testing.parameter(
by_dict={
"8x8": (8, 8),
}
)


quantization_parameters = tvm.testing.parameter(
by_dict = {
Expand Down
26 changes: 24 additions & 2 deletions tests/python/contrib/test_hexagon/test_qnn_conv2d_hmx.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ def schedule_func(ir_module, mem_scope, strides, filter_hw):
sch.annotate(bho, "software_pipeline_async_stages", [0, 2])
return sch.mod



class QuantizedConv2dHMXBase(Conv2dHMXTestingBase):
def evaluate(
self, hexagon_session, compose_one_convolution_quantized_separate_layout_transforms, workload_padding, input_activation, input_weights, input_bias, quantization_parameters, reference_output_quantized, mem_scope, use_microkernel=False
Expand Down Expand Up @@ -153,3 +151,27 @@ def test_microkernel_benchmark_one_call(
pytest.skip("Deep mode is not enabled in the microkernel")

self.evaluate(hexagon_session, compose_one_convolution_quantized_separate_layout_transforms, workload_padding, input_activation, input_weights, input_bias, quantization_parameters, reference_output_quantized, mem_scope, use_microkernel=True)

class TestQuantizedConv2dCustomTileShapes(TestQuantizedConv2d):
    """Run the inherited TestQuantizedConv2d cases over several tile shapes.

    Overrides the parameters below with single fixed values and sweeps
    ``tile_shape_hw`` over 8x8, 4x16 and 16x4.
    """

    # Fixed workload parameters.
    mem_scope = tvm.testing.parameter("global.vtcm")
    input_channels = tvm.testing.parameter(64)
    # output_channels = tvm.testing.parameter(256)
    filter_hw = tvm.testing.parameter(3)
    input_hw = tvm.testing.parameter(56)

    # Only the (1, 1) offset is enabled; (0, 0) is kept for reference.
    input_tile_offset = tvm.testing.parameter(
        # (0, 0),
        (1, 1),
    )

    # Only the "pad-1" case is enabled; "pad-0" is kept for reference.
    workload_padding = tvm.testing.parameter(
        by_dict={
            # "pad-0": (0, 0, 0, 0),
            "pad-1": (1, 1, 1, 1),
        }
    )

    # The tile shapes under test, keyed by the id shown in test names.
    tile_shape_hw = tvm.testing.parameter(
        by_dict={
            "8x8": (8, 8),
            "4x16": (4, 16),
            "16x4": (16, 4),
        }
    )

0 comments on commit ae49fe4

Please sign in to comment.