[HMX] Support different spatial layouts apache#15

adstraw · Feb 21, 2023 · ae49fe4 · ae49fe4
1 parent 105930f
commit ae49fe4
Show file tree

Hide file tree

Showing 5 changed files with 92 additions and 25 deletions.
diff --git a/src/runtime/hexagon/hexagon_hmx.cc b/src/runtime/hexagon/hexagon_hmx.cc
@@ -66,19 +66,19 @@ extern "C" void device_api_hexagon_hmx_matmul_u8(void* in_a_t, void* in_b_t, int
 }
 
 extern "C" void device_api_hexagon_hmx_conv2d_u8(void* in_a_t, void* in_a_next_t, void* in_w_t,
-                                                 int32_t height, int32_t width,
-                                                 int32_t a_elem_offset, int32_t a_next_elem_offset,
-                                                 int32_t w_elem_offset) {
+                                                 int32_t layout_enc, int32_t enc_filter_size,
+                                                 int32_t channels, int32_t a_elem_offset,
+                                                 int32_t a_next_elem_offset,
+                                                 int32_t w_elem_offset,
+                                                 int32_t w_size) {
   // RT_TRACE_PUT_REC(0, HMX_CONV_BEGIN);
   unsigned int aRs, aRt, wRs, wRt;
 
   // Rs, DM:
   // 31-11: activation crouton addr
   // 10-5: filter/weight size
   // 4-0: chan start
-  aRs = (height - 1) << 8;
-  aRs |= (width - 1) << 5;
-
+  aRs = enc_filter_size;
 
   unsigned int a_addr = reinterpret_cast<unsigned int>(reinterpret_cast<uint8_t*>(in_a_t) + a_elem_offset);
   std::bitset<32> a_addr_b(a_addr);
@@ -94,14 +94,11 @@ extern "C" void device_api_hexagon_hmx_conv2d_u8(void* in_a_t, void* in_a_next_t
   // 4-0: chan end
   aRt = a_addr_offset & ~0x7ff;
   std::bitset<32> a_addr_offset_b(a_addr_offset);
-  aRt |= 0b111000 << 5;
-  aRt |= 31;  // TODO(nverke): MAKE channel - 1
+  aRt |= layout_enc;
+  aRt |= channels - 1;
 
   wRs = reinterpret_cast<unsigned int>(reinterpret_cast<uint8_t*>(in_w_t) + w_elem_offset);
-  wRt = 32 * 32;  // weight size
-  wRt *= width;
-  wRt *= height;
-  wRt -= 1;       // weight size
+  wRt = w_size;
 
   // FARF(LOW, "CallExtern Q6_activation_ub_mxmem_RR_deep_cm(%08x, %08x)", aRs, aRt);
   // FARF(LOW, "CallExtern Q6_weight_b_mxmem_RR_deep(%08x, %08x)", wRs, wRt);
@@ -124,12 +121,13 @@ extern "C" void device_api_hexagon_hmx_matmul_cvt_u8(void* out_o_t, int32_t o_el
   Q6_mxmem_AR_after_cm_sat_ub(out_ptr, cRt);
 }
 
-extern "C" void device_api_hexagon_hmx_conv2d_cvt_after_u8(void* out_o_t, int32_t elem_offset) {
+extern "C" void device_api_hexagon_hmx_conv2d_cvt_after_u8(void* out_o_t, int32_t elem_offset,
+                                                           int32_t layout_enc) {
   // RT_TRACE_PUT_REC(0, HMX_CVT_BEGIN);
   // TODO(HWE): Partial stores need extra support in the form of shifting.
   unsigned int cRt;
   auto out_ptr = reinterpret_cast<void*>(reinterpret_cast<uint8_t*>(out_o_t) + elem_offset);
-  cRt = 0b111000 << 5;  // Spatial coordinates: HHHWWW
+  cRt = layout_enc;
   // FARF(LOW, "  Q6_mxmem_AR_after_cm_ub(%08x, %08x)\n", out_ptr, cRt);
   Q6_mxmem_AR_after_cm_sat_ub(out_ptr, cRt);
   // Q6_mxmem_AR_after_cm_ub(out_ptr, cRt);

diff --git a/tests/python/contrib/test_hexagon/conftest.py b/tests/python/contrib/test_hexagon/conftest.py
@@ -124,6 +124,7 @@ def compose_one_convolution_quantized_separate_layout_transforms(
     input_tile_offset,
     workload_padding,
     strides,
+    tile_shape_hw,
     mem_scope,
     allow_deep_mode,
 ):
@@ -136,6 +137,7 @@ def compose_one_convolution_quantized_separate_layout_transforms(
         tile_offset_A,
         workload_padding=workload_padding,
         strides=strides,
+        tile_shape_hw=tile_shape_hw,
         mem_scope=mem_scope,
         allow_deep_mode=allow_deep_mode
     )

diff --git a/tests/python/contrib/test_hexagon/hmx_qnn_conv2d_generator.py b/tests/python/contrib/test_hexagon/hmx_qnn_conv2d_generator.py
@@ -2,6 +2,34 @@
 from tvm import tir
 import tvm.script
 from tvm.script import tir as T
+from math import log2
+
+def get_raw_simple_spatial_mask(TH, TW):
+    """Return the raw 6-bit spatial mask for a tile of TH x TW positions.
+
+    The tile must contain exactly 64 spatial positions.  For a tile height
+    TH in [1, 64] the int(log2(TH)) most-significant bits of the 6-bit mask
+    are set and the remaining low bits are cleared, e.g. an 8x8 tile yields
+    0b111000 and a 4x16 tile yields 0b110000.
+    """
+    tile_area = TH * TW
+    assert tile_area == 64, f"Tile spatial size {TH} x {TW} must equal 64."
+    # Number of low-order mask bits left cleared (not claimed by the height).
+    cleared_low_bits = 6 - int(log2(TH))
+    low_bits = (1 << cleared_low_bits) - 1
+    return 0b111111 ^ low_bits
+
+def get_enc_filter_size_from_raw_mask(FH, FW, mask):
+    """Interleave (FH - 1) and (FW - 1) into a 6-bit field following `mask`.
+
+    Walking the six bit positions from least to most significant, a set bit
+    in `mask` receives the next unused low bit of (FH - 1) and a cleared bit
+    receives the next unused low bit of (FW - 1).
+
+    Parameters
+    ----------
+    FH, FW : int
+        Filter height and width, each on the range [1, 8].
+    mask : int
+        Raw 6-bit spatial mask (set bits select height, cleared bits width).
+
+    Returns
+    -------
+    int
+        The raw (unshifted) 6-bit filter-size encoding.
+
+    Raises
+    ------
+    AssertionError
+        If FH or FW is out of range, or if (FH - 1) / (FW - 1) needs more
+        bits than `mask` assigns to that dimension.
+    """
+    assert 0 < FH <= 8, f"Filter height {FH} must be on range [1, 8]."
+    assert 0 < FW <= 8, f"Filter width {FW} must be on range [1, 8]."
+    h_remaining = FH - 1
+    w_remaining = FW - 1
+    result = 0
+    for i in range(6):
+        if (mask >> i) & 1:
+            result |= (h_remaining & 1) << i
+            h_remaining >>= 1
+        else:
+            result |= (w_remaining & 1) << i
+            w_remaining >>= 1
+    # Any bits still left over would have been silently dropped, encoding a
+    # smaller filter than requested (e.g. FH=5 with only two height bits).
+    assert h_remaining == 0 and w_remaining == 0, (
+        f"Filter size {FH} x {FW} does not fit the spatial mask {mask:#08b}."
+    )
+    return result
+
+def convert_spatial_encoding_cm(encoding):
+    """Shift a raw spatial encoding left by 5 bits and return the result.
+
+    NOTE(review): presumably this places the 6-bit value into bits 10-5 of
+    the HMX register operand -- confirm against the runtime's register layout.
+    """
+    field_lsb = 5
+    return encoding << field_lsb
 
 def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
     input_shape,
@@ -88,6 +116,11 @@ def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
     TH, TW = tile_shape_hw
     tile_shape = [1, TH, TW, 32]
 
+    spatial_mask = get_raw_simple_spatial_mask(TH, TW)
+    spatial_encoding = convert_spatial_encoding_cm(spatial_mask)
+    filter_size_encoding = get_enc_filter_size_from_raw_mask(FH, FW, spatial_mask)
+    filter_size_encoding = convert_spatial_encoding_cm(filter_size_encoding)
+
     AC_pad_h_low, AC_pad_w_low = input_tile_offset
     assert 0 <= AC_pad_h_low < TH, (
         f"Height offset {AC_pad_h_low} must be on range [0, {TH}).  "
@@ -150,9 +183,10 @@ def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
     output_tile_offset = (BC_pad_h_low, BC_pad_w_low, BC_pad_h_high, BC_pad_w_high)
 
     CI = 32
+    WCI = tir.min(tir.ceildiv(C, 4) * 4, CI)
     CO = tir.ceildiv(C, 32)
     CII = 4
-    CIO = tir.ceildiv(CI, CII)
+    CIO = tir.ceildiv(WCI, CII)
     AHO = tir.ceildiv(AC_pad_h_low + AH, TH)
     AWO = tir.ceildiv(AC_pad_w_low + AW, TW)
 
@@ -178,6 +212,7 @@ def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
         # tir.IndexMap.AXIS_SEPARATOR,
         *filter_chunk_shape,
     ]
+    weight_size = (FH * FW * CIO * KI * CII) - 1
     transformed_output_shape = [
         1,
         BHO,
@@ -193,7 +228,8 @@ def make_hexagon_conv2d_quantized_no_layout_transform_nhwc(
     N_offset_w = (AC_pad_w_low - PW_low + (FW - 1)) // TW
 
     # The physical hardware allows for 2 such accumulators, which limits
-    # the size of the filter to FW<=8.
+    # the size of the filter to FW<=TW.
+    assert(FW <= TW)
     NA = 1 + tir.ceildiv(FW, TW)
 
     accumulator_shape = [
@@ -297,7 +333,7 @@ def filter_OIHW_to_tiled(F_handle: T.handle, FC_handle: T.handle):
             with T.block("filter_OIHW_to_tiled"):
                 for ko, co, fh, fw, cio, ki, cii in T.grid(KO, CO, FH, FW, CIO, KI, CII):
                     ci = CII * cio + cii
-                    c = CI * co + ci
+                    c = WCI * co + ci
                     k = KI * ko + ki
                     FC[ko, co, fh, fw, cio, ki, cii] = T.if_then_else(
                         0 <= k and k < K and 0 <= c and c < C,
@@ -532,7 +568,7 @@ def main(
                                                 0:32,
                                             ],
                                             zero_crouton[0:TH, 0:TW, 0:32],
-                                            FC[vko, vco, 0:FH, 0:FW, 0:8, 0:32, 0:4],
+                                            FC[vko, vco, 0:FH, 0:FW, 0:WCI, 0:32, 0:4],
                                         )
                                         # Most reads will occur from AC_croutons[0], but
                                         # overflow in the +h direction will read from
@@ -565,11 +601,13 @@ def main(
                                                 FC[vko, vco, 0, 0, 0, 0, 0],
                                                 dtype="handle",
                                             ),
-                                            FH,
-                                            FW,
+                                            spatial_encoding,
+                                            filter_size_encoding,
+                                            WCI,
                                             AC.elem_offset,
                                             AC.elem_offset,
                                             FC.elem_offset,
+                                            weight_size,
                                             dtype="handle",
                                         )
                                         # if 0 <= awo and awo < AWO:
@@ -587,8 +625,8 @@ def main(
                                             bho,
                                             T.max(awoplus + AWO_START - N_offset_w, 0) : T.min(T.max(awoplus + AWO_START + 1 +N_offset_w, 0), BWO),
                                             ko,
-                                            0:8,
-                                            0:8,
+                                            0:TH,
+                                            0:TW,
                                             0:32,
                                         ],
                                         acc_junkyard[0, 0, 0, 0]
@@ -606,6 +644,7 @@ def main(
                                                 dtype="handle",
                                             ),
                                             BC.elem_offset,
+                                            spatial_encoding,
                                             dtype="handle"
                                         )
                                     else:
@@ -614,6 +653,7 @@ def main(
                                             "device_api_hexagon_hmx_conv2d_cvt_after_u8",
                                             T.address_of(acc_junkyard[0, 0, 0, 0], dtype="handle"),
                                             acc_junkyard.elem_offset,
+                                            spatial_encoding,
                                             dtype="handle"
                                         )
             else:
@@ -661,7 +701,6 @@ def main(
                                 dtype="handle",
                             )
 
-
     return mod, {
         "input_tile_offset": input_tile_offset,
         "input_shape": input_shape,

diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py
@@ -438,6 +438,12 @@ class Conv2dHMXTestingBase:
 
     allow_deep_mode = tvm.testing.parameter(True)
 
+    # Spatial tile shape passed to the conv2d generator as a (height, width)
+    # pair.  Only the 8x8 shape is listed here.
+    tile_shape_hw = tvm.testing.parameter(
+        by_dict={
+            "8x8": (8, 8),
+        }
+    )
+
 
     quantization_parameters = tvm.testing.parameter(
         by_dict = {

diff --git a/tests/python/contrib/test_hexagon/test_qnn_conv2d_hmx.py b/tests/python/contrib/test_hexagon/test_qnn_conv2d_hmx.py
@@ -46,8 +46,6 @@ def schedule_func(ir_module, mem_scope, strides, filter_hw):
         sch.annotate(bho, "software_pipeline_async_stages", [0, 2])
     return sch.mod
 
-
-
 class QuantizedConv2dHMXBase(Conv2dHMXTestingBase):
     def evaluate(
         self, hexagon_session, compose_one_convolution_quantized_separate_layout_transforms, workload_padding, input_activation, input_weights, input_bias, quantization_parameters, reference_output_quantized, mem_scope, use_microkernel=False
@@ -153,3 +151,27 @@ def test_microkernel_benchmark_one_call(
             pytest.skip("Deep mode is not enabled in the microkernel")
 
         self.evaluate(hexagon_session, compose_one_convolution_quantized_separate_layout_transforms, workload_padding, input_activation, input_weights, input_bias, quantization_parameters, reference_output_quantized, mem_scope, use_microkernel=True)
+
+class TestQuantizedConv2dCustomTileShapes(TestQuantizedConv2d):
+    """Re-parametrization of TestQuantizedConv2d over several tile shapes.
+
+    Overrides the test parameters below with a single fixed workload
+    (64 input channels, filter_hw 3, input_hw 56, tile offset (1, 1),
+    padding (1, 1, 1, 1), "global.vtcm" memory scope) and varies only
+    tile_shape_hw across 8x8, 4x16 and 16x4.
+    """
+    mem_scope = tvm.testing.parameter("global.vtcm")
+    input_channels = tvm.testing.parameter(64)
+    # output_channels is left at whatever the base class provides.
+    # output_channels = tvm.testing.parameter(256)
+    filter_hw = tvm.testing.parameter(3)
+    input_hw = tvm.testing.parameter(56)
+    # Only the (1, 1) tile offset is enabled; (0, 0) is currently disabled.
+    input_tile_offset = tvm.testing.parameter(
+        # (0, 0),
+        (1, 1),
+    )
+    # Only the "pad-1" padding is enabled; "pad-0" is currently disabled.
+    workload_padding = tvm.testing.parameter(
+        by_dict={
+            # "pad-0": (0, 0, 0, 0),
+            "pad-1": (1, 1, 1, 1),
+        }
+    )
+    # Tile shapes as (height, width); each listed shape covers 64 positions.
+    tile_shape_hw = tvm.testing.parameter(
+        by_dict={
+            "8x8": (8, 8),
+            "4x16": (4, 16),
+            "16x4": (16, 4),
+        }
+    )