Add optimised 'Indirect BGEMM' binary convolution kernels. #516

Merged · 1 commit · Sep 29, 2020
Add optimised 'Indirect BGEMM' binary convolution kernels.
To start, add portable 4x2 C++ kernels for float/int8/bitpacked
output. Facilitate easy implementation of new indirect bgemm
kernels, including architecture-specific variations.
AdamHillier committed Sep 29, 2020
commit c057b6fa82c0695632142e815a3c65366b56a01e
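For orientation, here is a rough sketch (not part of this diff) of how the new pieces fit together at evaluation time. It mirrors the EvalOptIndirectBGEMM path added to bconv2d.cc below; the wrapper name RunIndirectBGEMMOnce is illustrative, and the surrounding TF Lite plumbing (tensor access, parameter setup) is assumed.

// Sketch only: mirrors the call sequence introduced by this PR; the wrapper
// name is illustrative and the TF Lite context around it is assumed.
#include <vector>

#include "larq_compute_engine/core/bconv2d/optimized_indirect_bgemm.h"
#include "larq_compute_engine/core/indirect_bgemm/kernel.h"
#include "larq_compute_engine/core/indirect_bgemm/prepare.h"

template <typename DstScalar>
void RunIndirectBGEMMOnce(
    const compute_engine::tflite::bconv2d::TfLiteBConv2DParams* conv_params,
    const tflite::RuntimeShape& bitpacked_input_shape,
    const tflite::RuntimeShape& output_shape,
    const compute_engine::core::bconv2d::OutputTransform<DstScalar>&
        output_transform,
    const compute_engine::core::TBitpacked* input_data,
    const compute_engine::core::TBitpacked* filter_data,
    DstScalar* output_data, const float* padding_buffer, int pad_value,
    std::vector<compute_engine::core::TBitpacked>& packed_weights,
    std::vector<const compute_engine::core::TBitpacked*>& indirection_buffer,
    std::vector<compute_engine::core::TBitpacked>& zero_buffer) {
  using namespace compute_engine::core;

  // 1. Pick a micro-kernel; for now this always returns the portable 4x2 one.
  const auto kernel = indirect_bgemm::SelectRuntimeKernel<DstScalar>(
      conv_params, bitpacked_input_shape, output_shape);

  // 2. One-time preparation: build the indirection buffer (pointers into the
  //    bitpacked input, with a shared zero buffer for padding) and pack the
  //    weights into the blocked layout the micro-kernel expects.
  indirect_bgemm::FillIndirectionBuffer(
      kernel.block_size_pixels, conv_params, bitpacked_input_shape,
      output_shape, input_data, indirection_buffer, zero_buffer);
  indirect_bgemm::PackWeights(kernel.block_size_output_channels, conv_params,
                              bitpacked_input_shape, output_shape, filter_data,
                              packed_weights);

  // 3. Run the convolution; float output with 'same-zero' padding also gets
  //    the zero-padding correction applied inside this call.
  bconv2d::BConv2DOptimizedIndirectBGEMM<DstScalar>(
      kernel, conv_params, bitpacked_input_shape, output_shape,
      output_transform, packed_weights.data(), indirection_buffer.data(),
      output_data, padding_buffer, pad_value);
}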
21 changes: 19 additions & 2 deletions larq_compute_engine/core/bconv2d/BUILD
@@ -31,9 +31,9 @@ cc_library(
)

cc_library(
name = "optimized",
name = "optimized_bgemm",
hdrs = [
"optimized.h",
"optimized_bgemm.h",
],
deps = [
":zero_padding_correction",
@@ -45,3 +45,20 @@ cc_library(
"@ruy//ruy/profiler:instrumentation",
],
)

cc_library(
name = "optimized_indirect_bgemm",
hdrs = [
"optimized_indirect_bgemm.h",
],
deps = [
":zero_padding_correction",
"//larq_compute_engine/core/indirect_bgemm:kernels",
"//larq_compute_engine/core/indirect_bgemm:prepare",
"@org_tensorflow//tensorflow/lite/kernels:cpu_backend_context",
"@org_tensorflow//tensorflow/lite/kernels:cpu_backend_gemm",
"@org_tensorflow//tensorflow/lite/kernels:padding",
"@org_tensorflow//tensorflow/lite/kernels/internal:optimized_base",
"@ruy//ruy/profiler:instrumentation",
],
)
larq_compute_engine/core/bconv2d/{optimized.h → optimized_bgemm.h} (renamed)
@@ -1,5 +1,5 @@
#ifndef COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_H_
#define COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_H_
#ifndef COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_BGEMM_H_
#define COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_BGEMM_H_

#include "larq_compute_engine/core/bconv2d/zero_padding_correction.h"
#include "larq_compute_engine/core/bgemm/bgemm.h"
@@ -61,7 +61,7 @@ inline void im2col(const ConvParams& params, const RuntimeShape& input_shape,
}

template <typename AccumScalar, typename DstScalar>
inline void BConv2DOptimized(
inline void BConv2DOptimizedBGEMM(
const ConvParams& params, const RuntimeShape& input_shape,
const TBitpacked* input_data, const RuntimeShape& filter_shape,
const TBitpacked* packed_filter_data,
@@ -152,6 +152,8 @@ inline void BConv2DOptimized(

if (std::is_same<DstScalar, float>::value &&
params.padding_type == PaddingType::kSame && pad_value == 0) {
ruy::profiler::ScopeLabel label("Zero padding correction");

const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
@@ -166,20 +168,17 @@ inline void BConv2DOptimized(
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);

{
ruy::profiler::ScopeLabel label("Zero padding correction");
zero_padding_correction::ApplyCorrection(
batches, input_height, input_width, input_depth, filter_height,
filter_width, output_depth, stride_height, stride_width,
dilation_height_factor, dilation_width_factor,
reinterpret_cast<float*>(output_data), output_height, output_width,
padding_buffer);
}
zero_padding_correction::ApplyCorrection(
batches, input_height, input_width, input_depth, filter_height,
filter_width, output_depth, stride_height, stride_width,
dilation_height_factor, dilation_width_factor,
reinterpret_cast<float*>(output_data), output_height, output_width,
padding_buffer);
}
}

} // namespace bconv2d
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_H_
#endif // COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_BGEMM_H_
68 changes: 68 additions & 0 deletions larq_compute_engine/core/bconv2d/optimized_indirect_bgemm.h
@@ -0,0 +1,68 @@
#ifndef COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_INDIRECT_BGEMM_H_
#define COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_INDIRECT_BGEMM_H_

#include "larq_compute_engine/core/bconv2d/zero_padding_correction.h"
#include "larq_compute_engine/core/indirect_bgemm/kernel.h"
#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace compute_engine {
namespace core {
namespace bconv2d {

template <typename AccumScalar, typename DstScalar>
inline void BConv2DOptimizedIndirectBGEMM(
const indirect_bgemm::IndirectBGEMMKernel<DstScalar> kernel,
const compute_engine::tflite::bconv2d::TfLiteBConv2DParams* conv_params,
const RuntimeShape& bitpacked_input_shape, const RuntimeShape& output_shape,
const OutputTransform<DstScalar>& output_transform,
const TBitpacked* packed_weights, const TBitpacked** indirection_buffer,
DstScalar* output_data, const float* padding_buffer, const int pad_value) {
TF_LITE_ASSERT_EQ(bitpacked_input_shape.DimensionsCount(), 4);
TF_LITE_ASSERT_EQ(output_shape.DimensionsCount(), 4);

ruy::profiler::ScopeLabel label("BConv2D (optimized, indirect BGEMM)");

const std::int32_t conv_kernel_size =
conv_params->filter_height * conv_params->filter_width;
const std::int32_t bitpacked_input_channels = bitpacked_input_shape.Dims(3);
const std::int32_t output_size = output_shape.Dims(1) * output_shape.Dims(2);
const std::int32_t output_channels = conv_params->channels_out;

indirect_bgemm::RunKernel(kernel, conv_kernel_size, bitpacked_input_channels,
output_size, output_channels, output_transform,
packed_weights, indirection_buffer, output_data);

if (std::is_same<DstScalar, float>::value &&
conv_params->padding_type == TfLitePadding::kTfLitePaddingSame &&
pad_value == 0) {
ruy::profiler::ScopeLabel label("Zero padding correction");

const int stride_width = conv_params->stride_width;
const int stride_height = conv_params->stride_height;
const int dilation_width_factor = conv_params->dilation_width_factor;
const int dilation_height_factor = conv_params->dilation_height_factor;
const int batches = MatchingDim(bitpacked_input_shape, 0, output_shape, 0);
const int input_depth = conv_params->channels_in;
const int input_width = bitpacked_input_shape.Dims(2);
const int input_height = bitpacked_input_shape.Dims(1);
const int filter_height = conv_params->filter_height;
const int filter_width = conv_params->filter_width;
const int output_depth = output_shape.Dims(3);
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);

zero_padding_correction::ApplyCorrection(
batches, input_height, input_width, input_depth, filter_height,
filter_width, output_depth, stride_height, stride_width,
dilation_height_factor, dilation_width_factor,
reinterpret_cast<float*>(output_data), output_height, output_width,
padding_buffer);
}
}

} // namespace bconv2d
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_INDIRECT_BGEMM_H_
30 changes: 30 additions & 0 deletions larq_compute_engine/core/indirect_bgemm/BUILD
@@ -0,0 +1,30 @@
licenses(["notice"]) # Apache 2.0

package(default_visibility = ["//visibility:public"])

cc_library(
name = "prepare",
hdrs = [
"prepare.h",
],
deps = [
"//larq_compute_engine/core:types",
"//larq_compute_engine/tflite/kernels:bconv2d_params",
"@org_tensorflow//tensorflow/lite/kernels/internal:types",
],
)

cc_library(
name = "kernels",
hdrs = [
"kernel.h",
"kernel_4x2_portable.h",
],
deps = [
"//larq_compute_engine/core:types",
"//larq_compute_engine/core/bconv2d:output_transform",
"//larq_compute_engine/tflite/kernels:bconv2d_params",
"@org_tensorflow//tensorflow/lite/kernels/internal:types",
"@ruy//ruy/profiler:instrumentation",
],
)
79 changes: 79 additions & 0 deletions larq_compute_engine/core/indirect_bgemm/kernel.h
@@ -0,0 +1,79 @@

#ifndef COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_H_
#define COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_H_

#include <cstdint>
#include <type_traits>

#include "larq_compute_engine/core/indirect_bgemm/kernel_4x2_portable.h"
#include "larq_compute_engine/core/types.h"
#include "larq_compute_engine/tflite/kernels/bconv2d_params.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/types.h"

using namespace tflite;

namespace compute_engine {
namespace core {
namespace indirect_bgemm {

using compute_engine::tflite::bconv2d::TfLiteBConv2DParams;

template <typename DstScalar>
struct IndirectBGEMMKernel {
using MicroKernelFunction = void(const std::int32_t, const std::int32_t,
const std::int32_t, const std::int32_t,
const bconv2d::OutputTransform<DstScalar>&,
const TBitpacked*, const TBitpacked**,
DstScalar*);
MicroKernelFunction* micro_kernel_function;
const std::int32_t block_size_output_channels;
const std::int32_t block_size_pixels;
};

// This function allows us to select which kernel to use at runtime based on any
// parameter we choose: destination scalar; conv params; input/output shapes;
// even detected CPU features.
// It is very important that this function is deterministic, as we rely on
// the fact that the same kernel is selected for each call to `Eval` (as long as
// the input shape doesn't change).
template <typename DstScalar>
inline IndirectBGEMMKernel<DstScalar> SelectRuntimeKernel(
const TfLiteBConv2DParams* conv_params,
const RuntimeShape& bitpacked_input_shape,
const RuntimeShape& output_shape) {
// For now there is only one kernel available.
return IndirectBGEMMKernel<DstScalar>{
&kernel_4x2_portable::RunKernel<DstScalar>, 4, 2};
}

template <typename DstScalar>
void RunKernel(const IndirectBGEMMKernel<DstScalar>& kernel,
const std::int32_t conv_kernel_size,
const std::int32_t bitpacked_input_channels,
const std::int32_t output_size,
const std::int32_t output_channels,
const bconv2d::OutputTransform<DstScalar>& output_transform,
const TBitpacked* packed_weights_ptr,
const TBitpacked** indirection_buffer, DstScalar* output_ptr) {
// TODO: implement multithreading here.
for (std::int32_t pixel_start = 0; pixel_start < output_size;
pixel_start += kernel.block_size_pixels) {
const std::int32_t output_stride =
std::is_same<DstScalar, TBitpacked>::value
? bitpacking::GetBitpackedSize(output_channels)
: output_channels;
kernel.micro_kernel_function(
std::min(output_size - pixel_start, kernel.block_size_pixels),
conv_kernel_size, bitpacked_input_channels, output_channels,
output_transform, packed_weights_ptr,
indirection_buffer + pixel_start * conv_kernel_size,
output_ptr + pixel_start * output_stride);
}
}

} // namespace indirect_bgemm
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_H_
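SelectRuntimeKernel above is the single dispatch point where, as the commit message says, architecture-specific variants can later be slotted in. A hypothetical sketch of what that could look like follows; the kernel_8x4_aarch64 namespace and its 8x4 block sizes are illustrative assumptions and not part of this PR, while RUY_PLATFORM_ARM_64 is the same guard the existing BGEMM path uses in bconv2d.cc.

// Hypothetical extension (not in this PR): dispatch to an architecture-
// specific micro-kernel when available, falling back to the portable one.
// kernel_8x4_aarch64::RunKernel and its 8x4 block sizes are assumptions.
template <typename DstScalar>
inline IndirectBGEMMKernel<DstScalar> SelectRuntimeKernel(
    const TfLiteBConv2DParams* conv_params,
    const RuntimeShape& bitpacked_input_shape,
    const RuntimeShape& output_shape) {
#if RUY_PLATFORM_ARM_64
  // Prefer a wider block on 64-bit Arm; any shape/param checks would go here.
  return IndirectBGEMMKernel<DstScalar>{
      &kernel_8x4_aarch64::RunKernel<DstScalar>, 8, 4};
#else
  return IndirectBGEMMKernel<DstScalar>{
      &kernel_4x2_portable::RunKernel<DstScalar>, 4, 2};
#endif
}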
245 changes: 245 additions & 0 deletions larq_compute_engine/core/indirect_bgemm/kernel_4x2_portable.h
@@ -0,0 +1,245 @@
#ifndef COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_4x2_PORTABLE_H_
#define COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_4x2_PORTABLE_H_

#include <cstdint>
#include <type_traits>

#include "larq_compute_engine/core/bconv2d/output_transform.h"
#include "larq_compute_engine/core/types.h"
#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace compute_engine {
namespace core {
namespace indirect_bgemm {
namespace kernel_4x2_portable {

/**
* A 4x2 C++ micro-kernel for float or int8 output.
*/
template <typename DstScalar>
void RunKernel(const std::int32_t block_num_pixels,
const std::int32_t conv_kernel_size,
const std::int32_t channels_in, const std::int32_t channels_out,
const bconv2d::OutputTransform<DstScalar>& output_transform,
const TBitpacked* weights_ptr,
const TBitpacked** indirection_buffer, DstScalar* output_ptr) {
static_assert(std::is_same<DstScalar, float>::value ||
std::is_same<DstScalar, std::int8_t>::value,
"");

ruy::profiler::ScopeLabel label("Indirect BGEMM block (4x2, portable)");

TFLITE_DCHECK_GE(block_num_pixels, 1);
TFLITE_DCHECK_LE(block_num_pixels, 2);
TFLITE_DCHECK_GE(conv_kernel_size, 1);
TFLITE_DCHECK_GE(channels_in, 1);
TFLITE_DCHECK_GE(channels_out, 1);

DstScalar* output_ptr_0 = output_ptr;
DstScalar* output_ptr_1 = output_ptr + channels_out;

// At the end of the output array we might get a block where the number of
// pixels is less than 2, if the overall output size is not a multiple of 2.
// When this happens we set the 'leftover' output pointer equal to the first
// output pointer, so that there's no risk of writing beyond the array bounds.
// At the end, when we write to the output array, we do it 'back to front' so
// that the outputs for the first pixel are written last, which means that the
// result will still be correct.
if (block_num_pixels < 2) {
output_ptr_1 = output_ptr_0;
}

std::int32_t c_out_index = 0;
do {
// Accumulators
std::int32_t acc_00 = 0, acc_01 = 0;
std::int32_t acc_10 = 0, acc_11 = 0;
std::int32_t acc_20 = 0, acc_21 = 0;
std::int32_t acc_30 = 0, acc_31 = 0;

std::int32_t k_size_index = conv_kernel_size;
do {
const TBitpacked* activations_ptr_0 = indirection_buffer[0];
const TBitpacked* activations_ptr_1 = indirection_buffer[1];
indirection_buffer += 2;

std::int32_t c_in_index = channels_in;
do {
const TBitpacked w_0 = weights_ptr[0];
const TBitpacked w_1 = weights_ptr[1];
const TBitpacked w_2 = weights_ptr[2];
const TBitpacked w_3 = weights_ptr[3];
weights_ptr += 4;

const TBitpacked a_0 = *activations_ptr_0++;
const TBitpacked a_1 = *activations_ptr_1++;

acc_00 += xor_popcount(w_0, a_0);
acc_10 += xor_popcount(w_1, a_0);
acc_20 += xor_popcount(w_2, a_0);
acc_30 += xor_popcount(w_3, a_0);
acc_01 += xor_popcount(w_0, a_1);
acc_11 += xor_popcount(w_1, a_1);
acc_21 += xor_popcount(w_2, a_1);
acc_31 += xor_popcount(w_3, a_1);
} while (--c_in_index > 0);
} while (--k_size_index > 0);

if (channels_out - c_out_index >= 4) {
output_ptr_1[0] = output_transform.Run(acc_01, c_out_index);
output_ptr_1[1] = output_transform.Run(acc_11, c_out_index + 1);
output_ptr_1[2] = output_transform.Run(acc_21, c_out_index + 2);
output_ptr_1[3] = output_transform.Run(acc_31, c_out_index + 3);
output_ptr_1 += 4;
output_ptr_0[0] = output_transform.Run(acc_00, c_out_index);
output_ptr_0[1] = output_transform.Run(acc_10, c_out_index + 1);
output_ptr_0[2] = output_transform.Run(acc_20, c_out_index + 2);
output_ptr_0[3] = output_transform.Run(acc_30, c_out_index + 3);
output_ptr_0 += 4;

indirection_buffer -= 2 * conv_kernel_size;
c_out_index += 4;
} else {
if (channels_out - c_out_index >= 2) {
output_ptr_1[0] = output_transform.Run(acc_01, c_out_index);
output_ptr_1[1] = output_transform.Run(acc_11, c_out_index + 1);
output_ptr_1 += 2;
output_ptr_0[0] = output_transform.Run(acc_00, c_out_index);
output_ptr_0[1] = output_transform.Run(acc_10, c_out_index + 1);
output_ptr_0 += 2;

acc_01 = acc_21;
acc_00 = acc_20;
c_out_index += 2;
}
if (channels_out - c_out_index >= 1) {
output_ptr_1[0] = output_transform.Run(acc_01, c_out_index);
output_ptr_0[0] = output_transform.Run(acc_00, c_out_index);
}

c_out_index = channels_out;
}
} while (c_out_index < channels_out);
}

/**
* A 4x2 C++ micro-kernel for bitpacked output.
*/
template <>
void RunKernel<TBitpacked>(
const std::int32_t block_num_pixels, const std::int32_t conv_kernel_size,
const std::int32_t channels_in, const std::int32_t channels_out,
const bconv2d::OutputTransform<TBitpacked>& output_transform,
const TBitpacked* weights_ptr, const TBitpacked** indirection_buffer,
TBitpacked* output_ptr) {
ruy::profiler::ScopeLabel label("Indirect BGEMM block (4x2, portable)");

TFLITE_DCHECK_GE(block_num_pixels, 1);
TFLITE_DCHECK_LE(block_num_pixels, 2);
TFLITE_DCHECK_GE(conv_kernel_size, 1);
TFLITE_DCHECK_GE(channels_in, 1);
TFLITE_DCHECK_GE(channels_out, 1);

TBitpacked* output_ptr_0 = output_ptr;
TBitpacked* output_ptr_1 =
output_ptr + bitpacking::GetBitpackedSize(channels_out);

// At the end of the output array we might get a block where the number of
// pixels is less than 2, if the overall output size is not a multiple of 2.
// When this happens we set the 'leftover' output pointer equal to the first
// output pointer, so that there's no risk of writing beyond the array bounds.
// At the end, when we write to the output array, we do it 'back to front' so
// that the outputs for the first pixel are written last, which means that the
// result will still be correct.
if (block_num_pixels < 2) {
output_ptr_1 = output_ptr_0;
}

// We will accumulate bits into these per-pixel columns and write a bitpacked
// value when the columns are full.
TBitpacked output_col_0 = 0, output_col_1 = 0;

std::int32_t c_out_index = 0;
do {
// Accumulators
std::int32_t acc_00 = 0, acc_01 = 0;
std::int32_t acc_10 = 0, acc_11 = 0;
std::int32_t acc_20 = 0, acc_21 = 0;
std::int32_t acc_30 = 0, acc_31 = 0;

std::int32_t k_size_index = conv_kernel_size;
do {
const TBitpacked* activations_ptr_0 = indirection_buffer[0];
const TBitpacked* activations_ptr_1 = indirection_buffer[1];
indirection_buffer += 2;

std::int32_t c_in_index = channels_in;
do {
const TBitpacked w_0 = weights_ptr[0];
const TBitpacked w_1 = weights_ptr[1];
const TBitpacked w_2 = weights_ptr[2];
const TBitpacked w_3 = weights_ptr[3];
weights_ptr += 4;

const TBitpacked a_0 = *activations_ptr_0++;
const TBitpacked a_1 = *activations_ptr_1++;

acc_00 += xor_popcount(w_0, a_0);
acc_10 += xor_popcount(w_1, a_0);
acc_20 += xor_popcount(w_2, a_0);
acc_30 += xor_popcount(w_3, a_0);
acc_01 += xor_popcount(w_0, a_1);
acc_11 += xor_popcount(w_1, a_1);
acc_21 += xor_popcount(w_2, a_1);
acc_31 += xor_popcount(w_3, a_1);
} while (--c_in_index > 0);
} while (--k_size_index > 0);

output_col_0 |= TBitpacked(output_transform.Run(acc_00, c_out_index))
<< (c_out_index % bitpacking_bitwidth);
output_col_0 |= TBitpacked(output_transform.Run(acc_10, c_out_index + 1))
<< ((c_out_index + 1) % bitpacking_bitwidth);
output_col_0 |= TBitpacked(output_transform.Run(acc_20, c_out_index + 2))
<< ((c_out_index + 2) % bitpacking_bitwidth);
output_col_0 |= TBitpacked(output_transform.Run(acc_30, c_out_index + 3))
<< ((c_out_index + 3) % bitpacking_bitwidth);
output_col_1 |= TBitpacked(output_transform.Run(acc_01, c_out_index))
<< (c_out_index % bitpacking_bitwidth);
output_col_1 |= TBitpacked(output_transform.Run(acc_11, c_out_index + 1))
<< ((c_out_index + 1) % bitpacking_bitwidth);
output_col_1 |= TBitpacked(output_transform.Run(acc_21, c_out_index + 2))
<< ((c_out_index + 2) % bitpacking_bitwidth);
output_col_1 |= TBitpacked(output_transform.Run(acc_31, c_out_index + 3))
<< ((c_out_index + 3) % bitpacking_bitwidth);

indirection_buffer -= 2 * conv_kernel_size;
c_out_index += 4;

// Write the bitpacked columns whenever they are full, or if we've computed
// the last output column value.
if (c_out_index % bitpacking_bitwidth == 0 || c_out_index >= channels_out) {
// If this is a 'leftover output channel' block (because the number of
// output channels isn't a multiple of four) then zero-out the extra bits.
if (c_out_index % bitpacking_bitwidth != 0) {
output_col_0 &=
(TBitpacked(1) << (channels_out % bitpacking_bitwidth)) - 1;
output_col_1 &=
(TBitpacked(1) << (channels_out % bitpacking_bitwidth)) - 1;
}

*output_ptr_1++ = output_col_1;
output_col_1 = 0;
*output_ptr_0++ = output_col_0;
output_col_0 = 0;
}
} while (c_out_index < channels_out);
}

} // namespace kernel_4x2_portable
} // namespace indirect_bgemm
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_4x2_PORTABLE_H_
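Both micro-kernels above accumulate xor_popcount(w, a), the number of bit positions where the packed weights and activations disagree. For n binary channels the underlying +/-1 dot product is n - 2*acc, and mapping that count back to a real-valued (or re-bitpacked) output is the job of the fused output transform. A minimal standalone illustration of the identity (not LCE code; the convention that a set bit encodes -1 is assumed):

// Standalone illustration of the binary dot-product identity used above.
#include <bitset>
#include <cstdint>
#include <iostream>

int main() {
  // Eight binary channels packed into one word (assume bit = 1 encodes -1).
  const std::uint8_t w = 0b10110010;
  const std::uint8_t a = 0b11010110;
  const int n = 8;
  const int acc = std::bitset<8>(w ^ a).count();  // xor_popcount(w, a) == 3
  // The +/-1 dot product equals n - 2 * acc.
  std::cout << "dot = " << (n - 2 * acc) << "\n";  // prints "dot = 2"
  return 0;
}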
145 changes: 145 additions & 0 deletions larq_compute_engine/core/indirect_bgemm/prepare.h
@@ -0,0 +1,145 @@
#ifndef COMPUTE_ENGINE_INDIRECT_BGEMM_PREPARE_H_
#define COMPUTE_ENGINE_INDIRECT_BGEMM_PREPARE_H_

#include <cstdint>

#include "larq_compute_engine/core/types.h"
#include "larq_compute_engine/tflite/kernels/bconv2d_params.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/types.h"

using namespace tflite;

namespace compute_engine {
namespace core {
namespace indirect_bgemm {

// This function is (heavily) adapted from this XNNPack function:
// https://github.com/google/XNNPACK/blob/80a8ac59849bfdae8d2e1409f5642baa502c0b9e/src/indirection.c#L18-L76
void FillIndirectionBuffer(const int block_size_pixels,
const TfLiteBConv2DParams* conv_params,
const RuntimeShape& bitpacked_input_shape,
const RuntimeShape& output_shape,
const TBitpacked* input_ptr,
std::vector<const TBitpacked*>& indirection_buffer,
std::vector<TBitpacked>& zero_buffer) {
using std::int32_t;

const int32_t kernel_height = conv_params->filter_height;
const int32_t kernel_width = conv_params->filter_width;
const int32_t stride_height = conv_params->stride_height;
const int32_t stride_width = conv_params->stride_width;
const int32_t dilation_height = conv_params->dilation_height_factor;
const int32_t dilation_width = conv_params->dilation_width_factor;
const int32_t input_padding_top = conv_params->padding_values.height;
const int32_t input_padding_left = conv_params->padding_values.width;

const int32_t input_height = bitpacked_input_shape.Dims(1);
const int32_t input_width = bitpacked_input_shape.Dims(2);
const int32_t bitpacked_input_channels = bitpacked_input_shape.Dims(3);

const int32_t output_height = output_shape.Dims(1);
const int32_t output_width = output_shape.Dims(2);

const int32_t output_size = output_height * output_width;
const int32_t kernel_size = kernel_height * kernel_width;
const int32_t tiled_output_size =
block_size_pixels *
((output_size + block_size_pixels - 1) / block_size_pixels);

indirection_buffer.resize(tiled_output_size * kernel_size);
zero_buffer.assign(kernel_size * bitpacked_input_channels, 0);

for (int32_t output_tile_start = 0; output_tile_start < tiled_output_size;
output_tile_start += block_size_pixels) {
for (int32_t output_tile_offset = 0; output_tile_offset < block_size_pixels;
output_tile_offset++) {
const int32_t output_index =
std::min(output_tile_start + output_tile_offset, output_size - 1);
const int32_t output_x = output_index % output_width;
const int32_t output_y = output_index / output_width;
for (int32_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
const int32_t input_y = output_y * stride_height +
kernel_y * dilation_height - input_padding_top;
if (0 <= input_y && input_y < input_height) {
for (int32_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
const int32_t input_x = output_x * stride_width +
kernel_x * dilation_width -
input_padding_left;
const int32_t kernel_index = kernel_y * kernel_width + kernel_x;
const int32_t index = output_tile_start * kernel_size +
kernel_index * block_size_pixels +
output_tile_offset;
if (0 <= input_x && input_x < input_width) {
indirection_buffer.at(index) =
(input_ptr + (input_y * input_width + input_x) *
bitpacked_input_channels);
} else {
indirection_buffer.at(index) = zero_buffer.data();
}
}
} else {
for (int32_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
const int32_t kernel_index = kernel_y * kernel_width + kernel_x;
const int32_t index = output_tile_start * kernel_size +
kernel_index * block_size_pixels +
output_tile_offset;
indirection_buffer.at(index) = zero_buffer.data();
}
}
}
}
}
}

// This function is (heavily) adapted from this XNNPack function:
// https://github.com/google/XNNPACK/blob/80a8ac59849bfdae8d2e1409f5642baa502c0b9e/src/packing.c#L429-L484
void PackWeights(const int block_size_output_channels,
const TfLiteBConv2DParams* conv_params,
const RuntimeShape& bitpacked_input_shape,
const RuntimeShape& output_shape,
const TBitpacked* weights_ptr,
std::vector<TBitpacked>& packed_weights) {
using std::int32_t;

const int32_t bitpacked_input_channels = bitpacked_input_shape.Dims(3);
const int32_t output_channels = conv_params->channels_out;
const int32_t kernel_size =
conv_params->filter_height * conv_params->filter_width;

const int32_t rounded_up_output_channels =
block_size_output_channels *
((output_channels + block_size_output_channels - 1) /
block_size_output_channels);

packed_weights.resize(rounded_up_output_channels * kernel_size *
bitpacked_input_channels);

int32_t packed_weights_index = 0;

for (int32_t block_start = 0; block_start < output_channels;
block_start += block_size_output_channels) {
const int32_t block_size =
std::min(output_channels - block_start, block_size_output_channels);
for (int32_t ki = 0; ki < kernel_size; ki++) {
for (int32_t ci = 0; ci < bitpacked_input_channels; ci++) {
for (int32_t block_offset = 0; block_offset < block_size;
block_offset++) {
const int32_t weights_index = (block_start + block_offset) *
kernel_size *
bitpacked_input_channels +
ki * bitpacked_input_channels + ci;
packed_weights.at(packed_weights_index++) =
weights_ptr[weights_index];
}
packed_weights_index += block_size_output_channels - block_size;
}
}
}
}

} // namespace indirect_bgemm
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_INDIRECT_BGEMM_PREPARE_H_
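To make the tiled layout that FillIndirectionBuffer produces concrete, here is a small worked example of the index formula used above (illustrative constants, not LCE code): with block_size_pixels = 2, a 3x3 kernel (kernel_size = 9) and output_size = 5, tiled_output_size is 6 and the buffer holds 6 * 9 = 54 pointers; the entry for output pixel 3 (tile start 2, offset 1) at kernel position 5 lands at index 2*9 + 5*2 + 1 = 29.

// Worked example of the indirection-buffer index computation above.
#include <cassert>

int main() {
  const int block_size_pixels = 2;   // pixel block size of the 4x2 kernel
  const int kernel_size = 9;         // 3x3 convolution kernel
  const int output_tile_start = 2;   // tile containing output pixel 3
  const int output_tile_offset = 1;  // position of pixel 3 within the tile
  const int kernel_index = 5;        // kernel_y * kernel_width + kernel_x
  const int index = output_tile_start * kernel_size +
                    kernel_index * block_size_pixels + output_tile_offset;
  assert(index == 29);
  return 0;
}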
8 changes: 7 additions & 1 deletion larq_compute_engine/tflite/kernels/BUILD
@@ -12,6 +12,9 @@ cc_library(
hdrs = [
"bconv2d_params.h",
],
deps = [
"//larq_compute_engine/core:types",
],
)

cc_library(
@@ -57,10 +60,13 @@ cc_library(
":bconv2d_params",
":utils",
"//larq_compute_engine/core:bmaxpool",
"//larq_compute_engine/core/bconv2d:optimized",
"//larq_compute_engine/core/bconv2d:optimized_bgemm",
"//larq_compute_engine/core/bconv2d:optimized_indirect_bgemm",
"//larq_compute_engine/core/bconv2d:reference",
"//larq_compute_engine/core/bitpacking:bitpack",
"//larq_compute_engine/core/bitpacking:utils",
"//larq_compute_engine/core/indirect_bgemm:kernels",
"//larq_compute_engine/core/indirect_bgemm:prepare",
"@flatbuffers",
"@org_tensorflow//tensorflow/lite:framework",
"@org_tensorflow//tensorflow/lite/kernels/internal:kernel_utils",
108 changes: 84 additions & 24 deletions larq_compute_engine/tflite/kernels/bconv2d.cc
@@ -2,9 +2,12 @@
#include <cstdint>

#include "flatbuffers/flexbuffers.h"
#include "larq_compute_engine/core/bconv2d/optimized.h"
#include "larq_compute_engine/core/bconv2d/optimized_bgemm.h"
#include "larq_compute_engine/core/bconv2d/optimized_indirect_bgemm.h"
#include "larq_compute_engine/core/bconv2d/reference.h"
#include "larq_compute_engine/core/bconv2d/zero_padding_correction.h"
#include "larq_compute_engine/core/indirect_bgemm/kernel.h"
#include "larq_compute_engine/core/indirect_bgemm/prepare.h"
#include "larq_compute_engine/core/types.h"
#include "larq_compute_engine/tflite/kernels/bconv2d_output_transform_utils.h"
#include "larq_compute_engine/tflite/kernels/bconv2d_params.h"
@@ -25,11 +28,14 @@ namespace bconv2d {
using namespace core::bitpacking;

enum class KernelType {
// kGenericRef: the reference implementation without im2col
kGenericRef,
// The reference implementation with for-loops.
kReference,

// kRuyOptimized: the Ruy implementation with hand-optimized BGEMM kernels.
kRuyOptimized,
// The Ruy implementation with im2col and hand-optimized BGEMM kernels.
kOptimizedBGEMM,

// The XNNPack-derived implementation with indirect BGEMM kernels.
kOptimizedIndirectBGEMM,
};

#define LCE_ENSURE_PARAM(conv_params, context, a) \
@@ -171,12 +177,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
"8-bit quantization is only supported with valid or one-padding");
}

if (kernel_type == KernelType::kGenericRef) {
if (kernel_type == KernelType::kReference) {
TF_LITE_ENSURE_MSG(
context,
conv_params->padding_type != kTfLitePaddingSame ||
conv_params->pad_value == 1,
"The reference kernel only supports valid or one-padding.");
} else if (kernel_type == KernelType::kOptimizedIndirectBGEMM) {
TF_LITE_ENSURE_MSG(
context, input->allocation_type != kTfLiteDynamic,
"The input tensor must not have dynamic allocation type");
}

// Determine the output dimensions and allocate the output buffer
@@ -193,7 +203,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
int temporaries_count = 0;

const bool need_im2col =
kernel_type == KernelType::kRuyOptimized &&
kernel_type == KernelType::kOptimizedBGEMM &&
(conv_params->stride_width != 1 || conv_params->stride_height != 1 ||
conv_params->dilation_width_factor != 1 ||
conv_params->dilation_height_factor != 1 ||
@@ -335,8 +345,8 @@ void OneTimeSetup(TfLiteContext* context, TfLiteNode* node,
}

template <typename AccumScalar, typename DstScalar>
void EvalOpt(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* params) {
void EvalOptBGEMM(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* params) {
if (!params->one_time_setup_complete) {
OneTimeSetup(context, node, params);
}
@@ -369,7 +379,7 @@ void EvalOpt(TfLiteContext* context, TfLiteNode* node,
// weights data.
// Likewise, we pass the original output shape even if we are going to
// write bitpacked output directly.
core::bconv2d::BConv2DOptimized<AccumScalar, DstScalar>(
core::bconv2d::BConv2DOptimizedBGEMM<AccumScalar, DstScalar>(
op_params, GetTensorShape(input), GetTensorData<TBitpacked>(input),
GetTensorShape(filter), GetTensorData<TBitpacked>(filter),
output_transform, unpacked_output_shape, GetTensorData<DstScalar>(output),
@@ -379,16 +389,56 @@ void EvalOpt(TfLiteContext* context, TfLiteNode* node,
}

template <typename DstScalar>
void EvalRef(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* params) {
void EvalOptIndirectBGEMM(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* conv_params) {
bool kernel_is_initialized = true;
if (!conv_params->one_time_setup_complete) {
OneTimeSetup(context, node, conv_params);
kernel_is_initialized = false;
}

const auto* input = GetInput(context, node, 0);
const auto* packed_filter = GetInput(context, node, 1);
const auto* filter = GetInput(context, node, 1);
auto* output = GetOutput(context, node, 0);

const auto bitpacked_input_shape = GetTensorShape(input);
const auto output_shape = GetTensorShape(output);

const auto kernel = core::indirect_bgemm::SelectRuntimeKernel<DstScalar>(
conv_params, bitpacked_input_shape, output_shape);

OutputTransform<DstScalar> output_transform;
GetOutputTransform(output_transform, context, node, conv_params);

if (!kernel_is_initialized) {
core::indirect_bgemm::FillIndirectionBuffer(
kernel.block_size_pixels, conv_params, bitpacked_input_shape,
output_shape, GetTensorData<TBitpacked>(input),
conv_params->indirection_buffer, conv_params->zero_buffer);
core::indirect_bgemm::PackWeights(
kernel.block_size_output_channels, conv_params, bitpacked_input_shape,
output_shape, GetTensorData<TBitpacked>(filter),
conv_params->packed_weights);
}

core::bconv2d::BConv2DOptimizedIndirectBGEMM<DstScalar>(
kernel, conv_params, bitpacked_input_shape, output_shape,
output_transform, conv_params->packed_weights.data(),
conv_params->indirection_buffer.data(), GetTensorData<DstScalar>(output),
conv_params->padding_buffer.data(), conv_params->pad_value);
}

template <typename DstScalar>
void EvalRef(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* params) {
if (!params->one_time_setup_complete) {
OneTimeSetup(context, node, params);
}

const auto* input = GetInput(context, node, 0);
const auto* packed_filter = GetInput(context, node, 1);
auto* output = GetOutput(context, node, 0);

// Using the standard TF Lite ConvParams struct.
// This requires extra step of converting the TfLiteBConv2DParams
// but unifies the interface with the default TF lite API for CONV params
@@ -399,7 +449,6 @@ void EvalRef(TfLiteContext* context, TfLiteNode* node,
OutputTransform<DstScalar> output_transform;
GetOutputTransform(output_transform, context, node, params);

TfLiteTensor* im2col = nullptr;
core::bconv2d::BConv2DReference<std::int32_t, DstScalar>(
op_params, GetTensorShape(input), GetTensorData<TBitpacked>(input),
GetTensorShape(packed_filter), GetTensorData<TBitpacked>(packed_filter),
@@ -411,7 +460,7 @@ template <KernelType kernel_type, typename DstScalar>
inline TfLiteStatus EvalChooseKernelType(TfLiteContext* context,
TfLiteNode* node,
TfLiteBConv2DParams* params) {
if (kernel_type == KernelType::kRuyOptimized) {
if (kernel_type == KernelType::kOptimizedBGEMM) {
#if RUY_PLATFORM_ARM_64
// On 64 bit Arm only there is an optimised kernel with 16-bit accumulators.
// It is safe to use this without risk of overflow as long as the maximum
@@ -422,14 +471,17 @@ inline TfLiteStatus EvalChooseKernelType(TfLiteContext* context,
const int depth =
params->filter_height * params->filter_width * params->channels_in;
if (depth + 512 < 1 << 16) {
EvalOpt<std::int16_t, DstScalar>(context, node, params);
EvalOptBGEMM<std::int16_t, DstScalar>(context, node, params);
return kTfLiteOk;
}
#endif
// In all other cases, use 32-bit accumulators.
EvalOpt<std::int32_t, DstScalar>(context, node, params);
EvalOptBGEMM<std::int32_t, DstScalar>(context, node, params);
return kTfLiteOk;
} else if (kernel_type == KernelType::kOptimizedIndirectBGEMM) {
EvalOptIndirectBGEMM<DstScalar>(context, node, params);
return kTfLiteOk;
} else if (kernel_type == KernelType::kGenericRef) {
} else if (kernel_type == KernelType::kReference) {
EvalRef<DstScalar>(context, node, params);
return kTfLiteOk;
}
@@ -456,23 +508,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TfLiteRegistration* Register_BCONV_2D_REF() {
static TfLiteRegistration r = {
bconv2d::Init, bconv2d::Free,
bconv2d::Prepare<bconv2d::KernelType::kGenericRef>,
bconv2d::Eval<bconv2d::KernelType::kGenericRef>};
bconv2d::Prepare<bconv2d::KernelType::kReference>,
bconv2d::Eval<bconv2d::KernelType::kReference>};
return &r;
}

TfLiteRegistration* Register_BCONV_2D_OPT_BGEMM() {
static TfLiteRegistration r = {
bconv2d::Init, bconv2d::Free,
bconv2d::Prepare<bconv2d::KernelType::kOptimizedBGEMM>,
bconv2d::Eval<bconv2d::KernelType::kOptimizedBGEMM>};
return &r;
}

TfLiteRegistration* Register_BCONV_2D_OPT() {
TfLiteRegistration* Register_BCONV_2D_OPT_INDIRECT_BGEMM() {
static TfLiteRegistration r = {
bconv2d::Init, bconv2d::Free,
bconv2d::Prepare<bconv2d::KernelType::kRuyOptimized>,
bconv2d::Eval<bconv2d::KernelType::kRuyOptimized>};
bconv2d::Prepare<bconv2d::KernelType::kOptimizedIndirectBGEMM>,
bconv2d::Eval<bconv2d::KernelType::kOptimizedIndirectBGEMM>};
return &r;
}

// Use this registration wrapper to decide which implementation to use.
TfLiteRegistration* Register_BCONV_2D() {
#if defined TFLITE_WITH_RUY
return Register_BCONV_2D_OPT();
return Register_BCONV_2D_OPT_BGEMM();
#else
return Register_BCONV_2D_REF();
#endif
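Note that the Register_BCONV_2D() wrapper above still selects the standard BGEMM (or reference) kernel; to exercise the new indirect-BGEMM path an application registers it explicitly. A hedged sketch of that wiring follows; the custom-op name "LceBconv2d" is an assumption here and should be checked against the converter output.

// Sketch: registering the new kernel with a TF Lite op resolver.
#include "tensorflow/lite/kernels/register.h"

namespace compute_engine {
namespace tflite {
TfLiteRegistration* Register_BCONV_2D_OPT_INDIRECT_BGEMM();
}  // namespace tflite
}  // namespace compute_engine

void RegisterLceOps(::tflite::ops::builtin::BuiltinOpResolver* resolver) {
  // "LceBconv2d" is the assumed custom-op name; verify before relying on it.
  resolver->AddCustom(
      "LceBconv2d",
      compute_engine::tflite::Register_BCONV_2D_OPT_INDIRECT_BGEMM());
}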
8 changes: 8 additions & 0 deletions larq_compute_engine/tflite/kernels/bconv2d_params.h
@@ -3,12 +3,15 @@

#include <vector>

#include "larq_compute_engine/core/types.h"
#include "tensorflow/lite/c/builtin_op_data.h"

namespace compute_engine {
namespace tflite {
namespace bconv2d {

using core::TBitpacked;

const int kTensorNotAllocated = -1;

struct TfLiteBConv2DParams {
@@ -44,6 +47,11 @@ struct TfLiteBConv2DParams {
// This is used when we have 'same-zero' padding.
std::vector<float> padding_buffer;

// These are used in the 'indirect bgemm' kernels.
std::vector<TBitpacked> packed_weights;
std::vector<const TBitpacked*> indirection_buffer;
std::vector<TBitpacked> zero_buffer;

// IDs are the arbitrary identifiers used by TF Lite to identify and access
// memory buffers. They are unique in the entire TF Lite context.
int im2col_id = kTensorNotAllocated;
36 changes: 20 additions & 16 deletions larq_compute_engine/tflite/tests/bconv2d_test.cc
@@ -187,7 +187,8 @@ namespace compute_engine {
namespace tflite {

TfLiteRegistration* Register_BCONV_2D_REF();
TfLiteRegistration* Register_BCONV_2D_OPT();
TfLiteRegistration* Register_BCONV_2D_OPT_BGEMM();
TfLiteRegistration* Register_BCONV_2D_OPT_INDIRECT_BGEMM();

namespace testing {

@@ -272,12 +273,14 @@ struct TestParam {

std::string kernel_name = "Unknown";
register_function registration =
compute_engine::tflite::Register_BCONV_2D_OPT;
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM;
};

const auto kKernelMap = new std::map<string, register_function>({
{"BConv2DREF", compute_engine::tflite::Register_BCONV_2D_REF},
{"BConv2DOPT", compute_engine::tflite::Register_BCONV_2D_OPT},
{"BConv2D_REF", compute_engine::tflite::Register_BCONV_2D_REF},
{"BConv2D_OPT_BGEMM", compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM},
{"BConv2D_OPT_INDIRECT_BGEMM",
compute_engine::tflite::Register_BCONV_2D_OPT_INDIRECT_BGEMM},
});

class BConv2DOpTest : public ::testing::TestWithParam<TestParamTuple> {
@@ -713,7 +716,8 @@ INSTANTIATE_TEST_SUITE_P(
ActivationFunctionType_RELU), // activation function
Values(1, 2), // number of threads
Values(std::pair<std::string, register_function>{
"BConv2DOPT", compute_engine::tflite::Register_BCONV_2D_OPT})),
"BConv2D_RUY_OPT_BGEMM",
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM})),
TestParam::TestNameSuffix);

// The BigTest suite will be skipped in the qemu CI runs as they take more than
@@ -757,11 +761,11 @@ TEST(BConv2DTests, ReluErrorDeathTest) {
// Test if fused ReLu throws an error in combination with zero-padding
EXPECT_DEATH(
{
FP_BConv2DOpModel m_lce(compute_engine::tflite::Register_BCONV_2D_OPT,
input_tensor, packed_filter_tensor,
output_tensor, post_tensor, post_tensor,
threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_RELU, 1, 1, 1);
FP_BConv2DOpModel m_lce(
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM, input_tensor,
packed_filter_tensor, output_tensor, post_tensor, post_tensor,
threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_RELU, 1, 1, 1);
},
"Fused activations are only supported with valid or one-padding.");

@@ -770,7 +774,7 @@ TEST(BConv2DTests, ReluErrorDeathTest) {
EXPECT_DEATH(
{
Bitpacked_BConv2DOpModel m_lce(
compute_engine::tflite::Register_BCONV_2D_OPT, input_tensor,
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM, input_tensor,
packed_filter_tensor, packed_output_tensor, post_tensor,
post_tensor, threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_NONE, 1, 1, 1);
@@ -793,11 +797,11 @@ TEST(BConv2DTests, Int8ErrorDeathTest) {

EXPECT_DEATH(
{
Int8_BConv2DOpModel m_lce(compute_engine::tflite::Register_BCONV_2D_OPT,
input_tensor, packed_filter_tensor,
output_tensor, post_tensor, post_tensor,
threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_NONE, 1, 1, 1);
Int8_BConv2DOpModel m_lce(
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM, input_tensor,
packed_filter_tensor, output_tensor, post_tensor, post_tensor,
threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_NONE, 1, 1, 1);
},
"8-bit quantization is only supported with valid or one-padding.");
}