Add optimised 'Indirect BGEMM' binary convolution kernels. #516

Merged · 1 commit · Sep 29, 2020
Add optimised 'Indirect BGEMM' binary convolution kernels.
To start, add portable 4x2 C++ kernels for float/int8/bitpacked
output. Facilitate easy implementation of new indirect bgemm
kernels, including architecture-specific variations.
AdamHillier committed Sep 29, 2020
commit c057b6fa82c0695632142e815a3c65366b56a01e
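For orientation, here is a rough sketch (not part of this diff) of how the new pieces fit together at evaluation time. It mirrors the EvalOptIndirectBGEMM path added to bconv2d.cc below; the wrapper name RunIndirectBGEMMOnce is illustrative, and the surrounding TF Lite plumbing (tensor access, parameter setup) is assumed.

// Sketch only: mirrors the call sequence introduced by this PR; the wrapper
// name is illustrative and the TF Lite context around it is assumed.
#include <vector>

#include "larq_compute_engine/core/bconv2d/optimized_indirect_bgemm.h"
#include "larq_compute_engine/core/indirect_bgemm/kernel.h"
#include "larq_compute_engine/core/indirect_bgemm/prepare.h"

template <typename DstScalar>
void RunIndirectBGEMMOnce(
    const compute_engine::tflite::bconv2d::TfLiteBConv2DParams* conv_params,
    const tflite::RuntimeShape& bitpacked_input_shape,
    const tflite::RuntimeShape& output_shape,
    const compute_engine::core::bconv2d::OutputTransform<DstScalar>&
        output_transform,
    const compute_engine::core::TBitpacked* input_data,
    const compute_engine::core::TBitpacked* filter_data,
    DstScalar* output_data, const float* padding_buffer, int pad_value,
    std::vector<compute_engine::core::TBitpacked>& packed_weights,
    std::vector<const compute_engine::core::TBitpacked*>& indirection_buffer,
    std::vector<compute_engine::core::TBitpacked>& zero_buffer) {
  using namespace compute_engine::core;

  // 1. Pick a micro-kernel; for now this always returns the portable 4x2 one.
  const auto kernel = indirect_bgemm::SelectRuntimeKernel<DstScalar>(
      conv_params, bitpacked_input_shape, output_shape);

  // 2. One-time preparation: build the indirection buffer (pointers into the
  //    bitpacked input, with a shared zero buffer for padding) and pack the
  //    weights into the blocked layout the micro-kernel expects.
  indirect_bgemm::FillIndirectionBuffer(
      kernel.block_size_pixels, conv_params, bitpacked_input_shape,
      output_shape, input_data, indirection_buffer, zero_buffer);
  indirect_bgemm::PackWeights(kernel.block_size_output_channels, conv_params,
                              bitpacked_input_shape, output_shape, filter_data,
                              packed_weights);

  // 3. Run the convolution; float output with 'same-zero' padding also gets
  //    the zero-padding correction applied inside this call.
  bconv2d::BConv2DOptimizedIndirectBGEMM<DstScalar>(
      kernel, conv_params, bitpacked_input_shape, output_shape,
      output_transform, packed_weights.data(), indirection_buffer.data(),
      output_data, padding_buffer, pad_value);
}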
21 changes: 19 additions & 2 deletions larq_compute_engine/core/bconv2d/BUILD
@@ -31,9 +31,9 @@ cc_library(
)

cc_library(
name = "optimized",
name = "optimized_bgemm",
hdrs = [
"optimized.h",
"optimized_bgemm.h",
],
deps = [
":zero_padding_correction",
@@ -45,3 +45,20 @@ cc_library(
"@ruy//ruy/profiler:instrumentation",
],
)

cc_library(
name = "optimized_indirect_bgemm",
hdrs = [
"optimized_indirect_bgemm.h",
],
deps = [
":zero_padding_correction",
"//larq_compute_engine/core/indirect_bgemm:kernels",
"//larq_compute_engine/core/indirect_bgemm:prepare",
"@org_tensorflow//tensorflow/lite/kernels:cpu_backend_context",
"@org_tensorflow//tensorflow/lite/kernels:cpu_backend_gemm",
"@org_tensorflow//tensorflow/lite/kernels:padding",
"@org_tensorflow//tensorflow/lite/kernels/internal:optimized_base",
"@ruy//ruy/profiler:instrumentation",
],
)
larq_compute_engine/core/bconv2d/{optimized.h → optimized_bgemm.h} (renamed)
@@ -1,5 +1,5 @@
#ifndef COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_H_
#define COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_H_
#ifndef COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_BGEMM_H_
#define COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_BGEMM_H_

#include "larq_compute_engine/core/bconv2d/zero_padding_correction.h"
#include "larq_compute_engine/core/bgemm/bgemm.h"
@@ -61,7 +61,7 @@ inline void im2col(const ConvParams& params, const RuntimeShape& input_shape,
}

template <typename AccumScalar, typename DstScalar>
inline void BConv2DOptimized(
inline void BConv2DOptimizedBGEMM(
const ConvParams& params, const RuntimeShape& input_shape,
const TBitpacked* input_data, const RuntimeShape& filter_shape,
const TBitpacked* packed_filter_data,
@@ -152,6 +152,8 @@ inline void BConv2DOptimized(

if (std::is_same<DstScalar, float>::value &&
params.padding_type == PaddingType::kSame && pad_value == 0) {
ruy::profiler::ScopeLabel label("Zero padding correction");

const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
@@ -166,20 +168,17 @@ inline void BConv2DOptimized(
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);

{
ruy::profiler::ScopeLabel label("Zero padding correction");
zero_padding_correction::ApplyCorrection(
batches, input_height, input_width, input_depth, filter_height,
filter_width, output_depth, stride_height, stride_width,
dilation_height_factor, dilation_width_factor,
reinterpret_cast<float*>(output_data), output_height, output_width,
padding_buffer);
}
zero_padding_correction::ApplyCorrection(
batches, input_height, input_width, input_depth, filter_height,
filter_width, output_depth, stride_height, stride_width,
dilation_height_factor, dilation_width_factor,
reinterpret_cast<float*>(output_data), output_height, output_width,
padding_buffer);
}
}

} // namespace bconv2d
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_H_
#endif // COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_BGEMM_H_
68 changes: 68 additions & 0 deletions larq_compute_engine/core/bconv2d/optimized_indirect_bgemm.h
@@ -0,0 +1,68 @@
#ifndef COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_INDIRECT_BGEMM_H_
#define COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_INDIRECT_BGEMM_H_

#include "larq_compute_engine/core/bconv2d/zero_padding_correction.h"
#include "larq_compute_engine/core/indirect_bgemm/kernel.h"
#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace compute_engine {
namespace core {
namespace bconv2d {

template <typename AccumScalar, typename DstScalar>
inline void BConv2DOptimizedIndirectBGEMM(
const indirect_bgemm::IndirectBGEMMKernel<DstScalar> kernel,
const compute_engine::tflite::bconv2d::TfLiteBConv2DParams* conv_params,
const RuntimeShape& bitpacked_input_shape, const RuntimeShape& output_shape,
const OutputTransform<DstScalar>& output_transform,
const TBitpacked* packed_weights, const TBitpacked** indirection_buffer,
DstScalar* output_data, const float* padding_buffer, const int pad_value) {
TF_LITE_ASSERT_EQ(bitpacked_input_shape.DimensionsCount(), 4);
TF_LITE_ASSERT_EQ(output_shape.DimensionsCount(), 4);

ruy::profiler::ScopeLabel label("BConv2D (optimized, indirect BGEMM)");

const std::int32_t conv_kernel_size =
conv_params->filter_height * conv_params->filter_width;
const std::int32_t bitpacked_input_channels = bitpacked_input_shape.Dims(3);
const std::int32_t output_size = output_shape.Dims(1) * output_shape.Dims(2);
const std::int32_t output_channels = conv_params->channels_out;

indirect_bgemm::RunKernel(kernel, conv_kernel_size, bitpacked_input_channels,
output_size, output_channels, output_transform,
packed_weights, indirection_buffer, output_data);

if (std::is_same<DstScalar, float>::value &&
conv_params->padding_type == TfLitePadding::kTfLitePaddingSame &&
pad_value == 0) {
ruy::profiler::ScopeLabel label("Zero padding correction");

const int stride_width = conv_params->stride_width;
const int stride_height = conv_params->stride_height;
const int dilation_width_factor = conv_params->dilation_width_factor;
const int dilation_height_factor = conv_params->dilation_height_factor;
const int batches = MatchingDim(bitpacked_input_shape, 0, output_shape, 0);
const int input_depth = conv_params->channels_in;
const int input_width = bitpacked_input_shape.Dims(2);
const int input_height = bitpacked_input_shape.Dims(1);
const int filter_height = conv_params->filter_height;
const int filter_width = conv_params->filter_width;
const int output_depth = output_shape.Dims(3);
const int output_width = output_shape.Dims(2);
const int output_height = output_shape.Dims(1);

zero_padding_correction::ApplyCorrection(
batches, input_height, input_width, input_depth, filter_height,
filter_width, output_depth, stride_height, stride_width,
dilation_height_factor, dilation_width_factor,
reinterpret_cast<float*>(output_data), output_height, output_width,
padding_buffer);
}
}

} // namespace bconv2d
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_CORE_BCONV2D_OPTIMIZED_INDIRECT_BGEMM_H_
30 changes: 30 additions & 0 deletions larq_compute_engine/core/indirect_bgemm/BUILD
@@ -0,0 +1,30 @@
licenses(["notice"]) # Apache 2.0

package(default_visibility = ["//visibility:public"])

cc_library(
name = "prepare",
hdrs = [
"prepare.h",
],
deps = [
"//larq_compute_engine/core:types",
"//larq_compute_engine/tflite/kernels:bconv2d_params",
"@org_tensorflow//tensorflow/lite/kernels/internal:types",
],
)

cc_library(
name = "kernels",
hdrs = [
"kernel.h",
"kernel_4x2_portable.h",
],
deps = [
"//larq_compute_engine/core:types",
"//larq_compute_engine/core/bconv2d:output_transform",
"//larq_compute_engine/tflite/kernels:bconv2d_params",
"@org_tensorflow//tensorflow/lite/kernels/internal:types",
"@ruy//ruy/profiler:instrumentation",
],
)
79 changes: 79 additions & 0 deletions larq_compute_engine/core/indirect_bgemm/kernel.h
@@ -0,0 +1,79 @@

#ifndef COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_H_
#define COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_H_

#include <cstdint>
#include <type_traits>

#include "larq_compute_engine/core/indirect_bgemm/kernel_4x2_portable.h"
#include "larq_compute_engine/core/types.h"
#include "larq_compute_engine/tflite/kernels/bconv2d_params.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/types.h"

using namespace tflite;

namespace compute_engine {
namespace core {
namespace indirect_bgemm {

using compute_engine::tflite::bconv2d::TfLiteBConv2DParams;

template <typename DstScalar>
struct IndirectBGEMMKernel {
using MicroKernelFunction = void(const std::int32_t, const std::int32_t,
const std::int32_t, const std::int32_t,
const bconv2d::OutputTransform<DstScalar>&,
const TBitpacked*, const TBitpacked**,
DstScalar*);
MicroKernelFunction* micro_kernel_function;
const std::int32_t block_size_output_channels;
const std::int32_t block_size_pixels;
};

// This function allows us to select which kernel to use at runtime based on any
// parameter we choose: destination scalar; conv params; input/output shapes;
// even detected CPU features.
// It is very important that this function is deterministic, as we rely on
// the fact that the same kernel is selected for each call to `Eval` (as long as
// the input shape doesn't change).
template <typename DstScalar>
inline IndirectBGEMMKernel<DstScalar> SelectRuntimeKernel(
const TfLiteBConv2DParams* conv_params,
const RuntimeShape& bitpacked_input_shape,
const RuntimeShape& output_shape) {
// For now there is only one kernel available.
return IndirectBGEMMKernel<DstScalar>{
&kernel_4x2_portable::RunKernel<DstScalar>, 4, 2};
}

template <typename DstScalar>
void RunKernel(const IndirectBGEMMKernel<DstScalar>& kernel,
const std::int32_t conv_kernel_size,
const std::int32_t bitpacked_input_channels,
const std::int32_t output_size,
const std::int32_t output_channels,
const bconv2d::OutputTransform<DstScalar>& output_transform,
const TBitpacked* packed_weights_ptr,
const TBitpacked** indirection_buffer, DstScalar* output_ptr) {
// TODO: implement multithreading here.
for (std::int32_t pixel_start = 0; pixel_start < output_size;
pixel_start += kernel.block_size_pixels) {
const std::int32_t output_stride =
std::is_same<DstScalar, TBitpacked>::value
? bitpacking::GetBitpackedSize(output_channels)
: output_channels;
kernel.micro_kernel_function(
std::min(output_size - pixel_start, kernel.block_size_pixels),
conv_kernel_size, bitpacked_input_channels, output_channels,
output_transform, packed_weights_ptr,
indirection_buffer + pixel_start * conv_kernel_size,
output_ptr + pixel_start * output_stride);
}
}

} // namespace indirect_bgemm
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_H_
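SelectRuntimeKernel above is the single dispatch point where, as the commit message says, architecture-specific variants can later be slotted in. A hypothetical sketch of what that could look like follows; the kernel_8x4_aarch64 namespace and its 8x4 block sizes are illustrative assumptions and not part of this PR, while RUY_PLATFORM_ARM_64 is the same guard the existing BGEMM path uses in bconv2d.cc.

// Hypothetical extension (not in this PR): dispatch to an architecture-
// specific micro-kernel when available, falling back to the portable one.
// kernel_8x4_aarch64::RunKernel and its 8x4 block sizes are assumptions.
template <typename DstScalar>
inline IndirectBGEMMKernel<DstScalar> SelectRuntimeKernel(
    const TfLiteBConv2DParams* conv_params,
    const RuntimeShape& bitpacked_input_shape,
    const RuntimeShape& output_shape) {
#if RUY_PLATFORM_ARM_64
  // Prefer a wider block on 64-bit Arm; any shape/param checks would go here.
  return IndirectBGEMMKernel<DstScalar>{
      &kernel_8x4_aarch64::RunKernel<DstScalar>, 8, 4};
#else
  return IndirectBGEMMKernel<DstScalar>{
      &kernel_4x2_portable::RunKernel<DstScalar>, 4, 2};
#endif
}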
245 changes: 245 additions & 0 deletions larq_compute_engine/core/indirect_bgemm/kernel_4x2_portable.h
@@ -0,0 +1,245 @@
#ifndef COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_4x2_PORTABLE_H_
#define COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_4x2_PORTABLE_H_

#include <cstdint>
#include <type_traits>

#include "larq_compute_engine/core/bconv2d/output_transform.h"
#include "larq_compute_engine/core/types.h"
#include "ruy/profiler/instrumentation.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace compute_engine {
namespace core {
namespace indirect_bgemm {
namespace kernel_4x2_portable {

/**
* A 4x2 C++ micro-kernel for float or int8 output.
*/
template <typename DstScalar>
void RunKernel(const std::int32_t block_num_pixels,
const std::int32_t conv_kernel_size,
const std::int32_t channels_in, const std::int32_t channels_out,
const bconv2d::OutputTransform<DstScalar>& output_transform,
const TBitpacked* weights_ptr,
const TBitpacked** indirection_buffer, DstScalar* output_ptr) {
static_assert(std::is_same<DstScalar, float>::value ||
std::is_same<DstScalar, std::int8_t>::value,
"");

ruy::profiler::ScopeLabel label("Indirect BGEMM block (4x2, portable)");

TFLITE_DCHECK_GE(block_num_pixels, 1);
TFLITE_DCHECK_LE(block_num_pixels, 2);
TFLITE_DCHECK_GE(conv_kernel_size, 1);
TFLITE_DCHECK_GE(channels_in, 1);
TFLITE_DCHECK_GE(channels_out, 1);

DstScalar* output_ptr_0 = output_ptr;
DstScalar* output_ptr_1 = output_ptr + channels_out;

// At the end of the output array we might get a block where the number of
// pixels is less than 2, if the overall output size is not a multiple of 2.
// When this happens we set the 'leftover' output pointer equal to the first
// output pointer, so that there's no risk of writing beyond the array bounds.
// At the end, when we write to the output array, we do it 'back to front' so
// that the outputs for the first pixel are written last, which means that the
// result will still be correct.
if (block_num_pixels < 2) {
output_ptr_1 = output_ptr_0;
}

std::int32_t c_out_index = 0;
do {
// Accumulators
std::int32_t acc_00 = 0, acc_01 = 0;
std::int32_t acc_10 = 0, acc_11 = 0;
std::int32_t acc_20 = 0, acc_21 = 0;
std::int32_t acc_30 = 0, acc_31 = 0;

std::int32_t k_size_index = conv_kernel_size;
do {
const TBitpacked* activations_ptr_0 = indirection_buffer[0];
const TBitpacked* activations_ptr_1 = indirection_buffer[1];
indirection_buffer += 2;

std::int32_t c_in_index = channels_in;
do {
const TBitpacked w_0 = weights_ptr[0];
const TBitpacked w_1 = weights_ptr[1];
const TBitpacked w_2 = weights_ptr[2];
const TBitpacked w_3 = weights_ptr[3];
weights_ptr += 4;

const TBitpacked a_0 = *activations_ptr_0++;
const TBitpacked a_1 = *activations_ptr_1++;

acc_00 += xor_popcount(w_0, a_0);
acc_10 += xor_popcount(w_1, a_0);
acc_20 += xor_popcount(w_2, a_0);
acc_30 += xor_popcount(w_3, a_0);
acc_01 += xor_popcount(w_0, a_1);
acc_11 += xor_popcount(w_1, a_1);
acc_21 += xor_popcount(w_2, a_1);
acc_31 += xor_popcount(w_3, a_1);
} while (--c_in_index > 0);
} while (--k_size_index > 0);

if (channels_out - c_out_index >= 4) {
output_ptr_1[0] = output_transform.Run(acc_01, c_out_index);
output_ptr_1[1] = output_transform.Run(acc_11, c_out_index + 1);
output_ptr_1[2] = output_transform.Run(acc_21, c_out_index + 2);
output_ptr_1[3] = output_transform.Run(acc_31, c_out_index + 3);
output_ptr_1 += 4;
output_ptr_0[0] = output_transform.Run(acc_00, c_out_index);
output_ptr_0[1] = output_transform.Run(acc_10, c_out_index + 1);
output_ptr_0[2] = output_transform.Run(acc_20, c_out_index + 2);
output_ptr_0[3] = output_transform.Run(acc_30, c_out_index + 3);
output_ptr_0 += 4;

indirection_buffer -= 2 * conv_kernel_size;
c_out_index += 4;
} else {
if (channels_out - c_out_index >= 2) {
output_ptr_1[0] = output_transform.Run(acc_01, c_out_index);
output_ptr_1[1] = output_transform.Run(acc_11, c_out_index + 1);
output_ptr_1 += 2;
output_ptr_0[0] = output_transform.Run(acc_00, c_out_index);
output_ptr_0[1] = output_transform.Run(acc_10, c_out_index + 1);
output_ptr_0 += 2;

acc_01 = acc_21;
acc_00 = acc_20;
c_out_index += 2;
}
if (channels_out - c_out_index >= 1) {
output_ptr_1[0] = output_transform.Run(acc_01, c_out_index);
output_ptr_0[0] = output_transform.Run(acc_00, c_out_index);
}

c_out_index = channels_out;
}
} while (c_out_index < channels_out);
}

/**
* A 4x2 C++ micro-kernel for bitpacked output.
*/
template <>
void RunKernel<TBitpacked>(
const std::int32_t block_num_pixels, const std::int32_t conv_kernel_size,
const std::int32_t channels_in, const std::int32_t channels_out,
const bconv2d::OutputTransform<TBitpacked>& output_transform,
const TBitpacked* weights_ptr, const TBitpacked** indirection_buffer,
TBitpacked* output_ptr) {
ruy::profiler::ScopeLabel label("Indirect BGEMM block (4x2, portable)");

TFLITE_DCHECK_GE(block_num_pixels, 1);
TFLITE_DCHECK_LE(block_num_pixels, 2);
TFLITE_DCHECK_GE(conv_kernel_size, 1);
TFLITE_DCHECK_GE(channels_in, 1);
TFLITE_DCHECK_GE(channels_out, 1);

TBitpacked* output_ptr_0 = output_ptr;
TBitpacked* output_ptr_1 =
output_ptr + bitpacking::GetBitpackedSize(channels_out);

// At the end of the output array we might get a block where the number of
// pixels is less than 2, if the overall output size is not a multiple of 2.
// When this happens we set the 'leftover' output pointer equal to the first
// output pointer, so that there's no risk of writing beyond the array bounds.
// At the end, when we write to the output array, we do it 'back to front' so
// that the outputs for the first pixel are written last, which means that the
// result will still be correct.
if (block_num_pixels < 2) {
output_ptr_1 = output_ptr_0;
}

// We will accumulate bits into these per-pixel columns and write a bitpacked
// value when the columns are full.
TBitpacked output_col_0 = 0, output_col_1 = 0;

std::int32_t c_out_index = 0;
do {
// Accumulators
std::int32_t acc_00 = 0, acc_01 = 0;
std::int32_t acc_10 = 0, acc_11 = 0;
std::int32_t acc_20 = 0, acc_21 = 0;
std::int32_t acc_30 = 0, acc_31 = 0;

std::int32_t k_size_index = conv_kernel_size;
do {
const TBitpacked* activations_ptr_0 = indirection_buffer[0];
const TBitpacked* activations_ptr_1 = indirection_buffer[1];
indirection_buffer += 2;

std::int32_t c_in_index = channels_in;
do {
const TBitpacked w_0 = weights_ptr[0];
const TBitpacked w_1 = weights_ptr[1];
const TBitpacked w_2 = weights_ptr[2];
const TBitpacked w_3 = weights_ptr[3];
weights_ptr += 4;

const TBitpacked a_0 = *activations_ptr_0++;
const TBitpacked a_1 = *activations_ptr_1++;

acc_00 += xor_popcount(w_0, a_0);
acc_10 += xor_popcount(w_1, a_0);
acc_20 += xor_popcount(w_2, a_0);
acc_30 += xor_popcount(w_3, a_0);
acc_01 += xor_popcount(w_0, a_1);
acc_11 += xor_popcount(w_1, a_1);
acc_21 += xor_popcount(w_2, a_1);
acc_31 += xor_popcount(w_3, a_1);
} while (--c_in_index > 0);
} while (--k_size_index > 0);

output_col_0 |= TBitpacked(output_transform.Run(acc_00, c_out_index))
<< (c_out_index % bitpacking_bitwidth);
output_col_0 |= TBitpacked(output_transform.Run(acc_10, c_out_index + 1))
<< ((c_out_index + 1) % bitpacking_bitwidth);
output_col_0 |= TBitpacked(output_transform.Run(acc_20, c_out_index + 2))
<< ((c_out_index + 2) % bitpacking_bitwidth);
output_col_0 |= TBitpacked(output_transform.Run(acc_30, c_out_index + 3))
<< ((c_out_index + 3) % bitpacking_bitwidth);
output_col_1 |= TBitpacked(output_transform.Run(acc_01, c_out_index))
<< (c_out_index % bitpacking_bitwidth);
output_col_1 |= TBitpacked(output_transform.Run(acc_11, c_out_index + 1))
<< ((c_out_index + 1) % bitpacking_bitwidth);
output_col_1 |= TBitpacked(output_transform.Run(acc_21, c_out_index + 2))
<< ((c_out_index + 2) % bitpacking_bitwidth);
output_col_1 |= TBitpacked(output_transform.Run(acc_31, c_out_index + 3))
<< ((c_out_index + 3) % bitpacking_bitwidth);

indirection_buffer -= 2 * conv_kernel_size;
c_out_index += 4;

// Write the bitpacked columns whenever they are full, or if we've computed
// the last output column value.
if (c_out_index % bitpacking_bitwidth == 0 || c_out_index >= channels_out) {
// If this is a 'leftover output channel' block (because the number of
// output channels isn't a multiple of four) then zero-out the extra bits.
if (c_out_index % bitpacking_bitwidth != 0) {
output_col_0 &=
(TBitpacked(1) << (channels_out % bitpacking_bitwidth)) - 1;
output_col_1 &=
(TBitpacked(1) << (channels_out % bitpacking_bitwidth)) - 1;
}

*output_ptr_1++ = output_col_1;
output_col_1 = 0;
*output_ptr_0++ = output_col_0;
output_col_0 = 0;
}
} while (c_out_index < channels_out);
}

} // namespace kernel_4x2_portable
} // namespace indirect_bgemm
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_INDIRECT_BGEMM_KERNEL_4x2_PORTABLE_H_
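Both micro-kernels above accumulate xor_popcount(w, a), the number of bit positions where the packed weights and activations disagree. For n binary channels the underlying +/-1 dot product is n - 2*acc, and mapping that count back to a real-valued (or re-bitpacked) output is the job of the fused output transform. A minimal standalone illustration of the identity (not LCE code; the convention that a set bit encodes -1 is assumed):

// Standalone illustration of the binary dot-product identity used above.
#include <bitset>
#include <cstdint>
#include <iostream>

int main() {
  // Eight binary channels packed into one word (assume bit = 1 encodes -1).
  const std::uint8_t w = 0b10110010;
  const std::uint8_t a = 0b11010110;
  const int n = 8;
  const int acc = std::bitset<8>(w ^ a).count();  // xor_popcount(w, a) == 3
  // The +/-1 dot product equals n - 2 * acc.
  std::cout << "dot = " << (n - 2 * acc) << "\n";  // prints "dot = 2"
  return 0;
}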
145 changes: 145 additions & 0 deletions larq_compute_engine/core/indirect_bgemm/prepare.h
@@ -0,0 +1,145 @@
#ifndef COMPUTE_ENGINE_INDIRECT_BGEMM_PREPARE_H_
#define COMPUTE_ENGINE_INDIRECT_BGEMM_PREPARE_H_

#include <cstdint>

#include "larq_compute_engine/core/types.h"
#include "larq_compute_engine/tflite/kernels/bconv2d_params.h"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/types.h"

using namespace tflite;

namespace compute_engine {
namespace core {
namespace indirect_bgemm {

// This function is (heavily) adapted from this XNNPack function:
// https://github.com/google/XNNPACK/blob/80a8ac59849bfdae8d2e1409f5642baa502c0b9e/src/indirection.c#L18-L76
void FillIndirectionBuffer(const int block_size_pixels,
const TfLiteBConv2DParams* conv_params,
const RuntimeShape& bitpacked_input_shape,
const RuntimeShape& output_shape,
const TBitpacked* input_ptr,
std::vector<const TBitpacked*>& indirection_buffer,
std::vector<TBitpacked>& zero_buffer) {
using std::int32_t;

const int32_t kernel_height = conv_params->filter_height;
const int32_t kernel_width = conv_params->filter_width;
const int32_t stride_height = conv_params->stride_height;
const int32_t stride_width = conv_params->stride_width;
const int32_t dilation_height = conv_params->dilation_height_factor;
const int32_t dilation_width = conv_params->dilation_width_factor;
const int32_t input_padding_top = conv_params->padding_values.height;
const int32_t input_padding_left = conv_params->padding_values.width;

const int32_t input_height = bitpacked_input_shape.Dims(1);
const int32_t input_width = bitpacked_input_shape.Dims(2);
const int32_t bitpacked_input_channels = bitpacked_input_shape.Dims(3);

const int32_t output_height = output_shape.Dims(1);
const int32_t output_width = output_shape.Dims(2);

const int32_t output_size = output_height * output_width;
const int32_t kernel_size = kernel_height * kernel_width;
const int32_t tiled_output_size =
block_size_pixels *
((output_size + block_size_pixels - 1) / block_size_pixels);

indirection_buffer.resize(tiled_output_size * kernel_size);
zero_buffer.assign(kernel_size * bitpacked_input_channels, 0);

for (int32_t output_tile_start = 0; output_tile_start < tiled_output_size;
output_tile_start += block_size_pixels) {
for (int32_t output_tile_offset = 0; output_tile_offset < block_size_pixels;
output_tile_offset++) {
const int32_t output_index =
std::min(output_tile_start + output_tile_offset, output_size - 1);
const int32_t output_x = output_index % output_width;
const int32_t output_y = output_index / output_width;
for (int32_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
const int32_t input_y = output_y * stride_height +
kernel_y * dilation_height - input_padding_top;
if (0 <= input_y && input_y < input_height) {
for (int32_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
const int32_t input_x = output_x * stride_width +
kernel_x * dilation_width -
input_padding_left;
const int32_t kernel_index = kernel_y * kernel_width + kernel_x;
const int32_t index = output_tile_start * kernel_size +
kernel_index * block_size_pixels +
output_tile_offset;
if (0 <= input_x && input_x < input_width) {
indirection_buffer.at(index) =
(input_ptr + (input_y * input_width + input_x) *
bitpacked_input_channels);
} else {
indirection_buffer.at(index) = zero_buffer.data();
}
}
} else {
for (int32_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
const int32_t kernel_index = kernel_y * kernel_width + kernel_x;
const int32_t index = output_tile_start * kernel_size +
kernel_index * block_size_pixels +
output_tile_offset;
indirection_buffer.at(index) = zero_buffer.data();
}
}
}
}
}
}

// This function is (heavily) adapted from this XNNPack function:
// https://github.com/google/XNNPACK/blob/80a8ac59849bfdae8d2e1409f5642baa502c0b9e/src/packing.c#L429-L484
void PackWeights(const int block_size_output_channels,
const TfLiteBConv2DParams* conv_params,
const RuntimeShape& bitpacked_input_shape,
const RuntimeShape& output_shape,
const TBitpacked* weights_ptr,
std::vector<TBitpacked>& packed_weights) {
using std::int32_t;

const int32_t bitpacked_input_channels = bitpacked_input_shape.Dims(3);
const int32_t output_channels = conv_params->channels_out;
const int32_t kernel_size =
conv_params->filter_height * conv_params->filter_width;

const int32_t rounded_up_output_channels =
block_size_output_channels *
((output_channels + block_size_output_channels - 1) /
block_size_output_channels);

packed_weights.resize(rounded_up_output_channels * kernel_size *
bitpacked_input_channels);

int32_t packed_weights_index = 0;

for (int32_t block_start = 0; block_start < output_channels;
block_start += block_size_output_channels) {
const int32_t block_size =
std::min(output_channels - block_start, block_size_output_channels);
for (int32_t ki = 0; ki < kernel_size; ki++) {
for (int32_t ci = 0; ci < bitpacked_input_channels; ci++) {
for (int32_t block_offset = 0; block_offset < block_size;
block_offset++) {
const int32_t weights_index = (block_start + block_offset) *
kernel_size *
bitpacked_input_channels +
ki * bitpacked_input_channels + ci;
packed_weights.at(packed_weights_index++) =
weights_ptr[weights_index];
}
packed_weights_index += block_size_output_channels - block_size;
}
}
}
}

} // namespace indirect_bgemm
} // namespace core
} // namespace compute_engine

#endif // COMPUTE_ENGINE_INDIRECT_BGEMM_PREPARE_H_
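To make the tiled layout that FillIndirectionBuffer produces concrete, here is a small worked example of the index formula used above (illustrative constants, not LCE code): with block_size_pixels = 2, a 3x3 kernel (kernel_size = 9) and output_size = 5, tiled_output_size is 6 and the buffer holds 6 * 9 = 54 pointers; the entry for output pixel 3 (tile start 2, offset 1) at kernel position 5 lands at index 2*9 + 5*2 + 1 = 29.

// Worked example of the indirection-buffer index computation above.
#include <cassert>

int main() {
  const int block_size_pixels = 2;   // pixel block size of the 4x2 kernel
  const int kernel_size = 9;         // 3x3 convolution kernel
  const int output_tile_start = 2;   // tile containing output pixel 3
  const int output_tile_offset = 1;  // position of pixel 3 within the tile
  const int kernel_index = 5;        // kernel_y * kernel_width + kernel_x
  const int index = output_tile_start * kernel_size +
                    kernel_index * block_size_pixels + output_tile_offset;
  assert(index == 29);
  return 0;
}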
8 changes: 7 additions & 1 deletion larq_compute_engine/tflite/kernels/BUILD
@@ -12,6 +12,9 @@ cc_library(
hdrs = [
"bconv2d_params.h",
],
deps = [
"//larq_compute_engine/core:types",
],
)

cc_library(
@@ -57,10 +60,13 @@ cc_library(
":bconv2d_params",
":utils",
"//larq_compute_engine/core:bmaxpool",
"//larq_compute_engine/core/bconv2d:optimized",
"//larq_compute_engine/core/bconv2d:optimized_bgemm",
"//larq_compute_engine/core/bconv2d:optimized_indirect_bgemm",
"//larq_compute_engine/core/bconv2d:reference",
"//larq_compute_engine/core/bitpacking:bitpack",
"//larq_compute_engine/core/bitpacking:utils",
"//larq_compute_engine/core/indirect_bgemm:kernels",
"//larq_compute_engine/core/indirect_bgemm:prepare",
"@flatbuffers",
"@org_tensorflow//tensorflow/lite:framework",
"@org_tensorflow//tensorflow/lite/kernels/internal:kernel_utils",
108 changes: 84 additions & 24 deletions larq_compute_engine/tflite/kernels/bconv2d.cc
@@ -2,9 +2,12 @@
#include <cstdint>

#include "flatbuffers/flexbuffers.h"
#include "larq_compute_engine/core/bconv2d/optimized.h"
#include "larq_compute_engine/core/bconv2d/optimized_bgemm.h"
#include "larq_compute_engine/core/bconv2d/optimized_indirect_bgemm.h"
#include "larq_compute_engine/core/bconv2d/reference.h"
#include "larq_compute_engine/core/bconv2d/zero_padding_correction.h"
#include "larq_compute_engine/core/indirect_bgemm/kernel.h"
#include "larq_compute_engine/core/indirect_bgemm/prepare.h"
#include "larq_compute_engine/core/types.h"
#include "larq_compute_engine/tflite/kernels/bconv2d_output_transform_utils.h"
#include "larq_compute_engine/tflite/kernels/bconv2d_params.h"
@@ -25,11 +28,14 @@ namespace bconv2d {
using namespace core::bitpacking;

enum class KernelType {
// kGenericRef: the reference implementation without im2col
kGenericRef,
// The reference implementation with for-loops.
kReference,

// kRuyOptimized: the Ruy implementation with hand-optimized BGEMM kernels.
kRuyOptimized,
// The Ruy implementation with im2col and hand-optimized BGEMM kernels.
kOptimizedBGEMM,

// The XNNPack-derived implementation with indirect BGEMM kernels.
kOptimizedIndirectBGEMM,
};

#define LCE_ENSURE_PARAM(conv_params, context, a) \
@@ -171,12 +177,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
"8-bit quantization is only supported with valid or one-padding");
}

if (kernel_type == KernelType::kGenericRef) {
if (kernel_type == KernelType::kReference) {
TF_LITE_ENSURE_MSG(
context,
conv_params->padding_type != kTfLitePaddingSame ||
conv_params->pad_value == 1,
"The reference kernel only supports valid or one-padding.");
} else if (kernel_type == KernelType::kOptimizedIndirectBGEMM) {
TF_LITE_ENSURE_MSG(
context, input->allocation_type != kTfLiteDynamic,
"The input tensor must not have dynamic allocation type");
}

// Determine the output dimensions and allocate the output buffer
@@ -193,7 +203,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
int temporaries_count = 0;

const bool need_im2col =
kernel_type == KernelType::kRuyOptimized &&
kernel_type == KernelType::kOptimizedBGEMM &&
(conv_params->stride_width != 1 || conv_params->stride_height != 1 ||
conv_params->dilation_width_factor != 1 ||
conv_params->dilation_height_factor != 1 ||
@@ -335,8 +345,8 @@ void OneTimeSetup(TfLiteContext* context, TfLiteNode* node,
}

template <typename AccumScalar, typename DstScalar>
void EvalOpt(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* params) {
void EvalOptBGEMM(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* params) {
if (!params->one_time_setup_complete) {
OneTimeSetup(context, node, params);
}
@@ -369,7 +379,7 @@ void EvalOpt(TfLiteContext* context, TfLiteNode* node,
// weights data.
// Likewise, we pass the original output shape even if we are going to
// write bitpacked output directly.
core::bconv2d::BConv2DOptimized<AccumScalar, DstScalar>(
core::bconv2d::BConv2DOptimizedBGEMM<AccumScalar, DstScalar>(
op_params, GetTensorShape(input), GetTensorData<TBitpacked>(input),
GetTensorShape(filter), GetTensorData<TBitpacked>(filter),
output_transform, unpacked_output_shape, GetTensorData<DstScalar>(output),
@@ -379,16 +389,56 @@ void EvalOpt(TfLiteContext* context, TfLiteNode* node,
}

template <typename DstScalar>
void EvalRef(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* params) {
void EvalOptIndirectBGEMM(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* conv_params) {
bool kernel_is_initialized = true;
if (!conv_params->one_time_setup_complete) {
OneTimeSetup(context, node, conv_params);
kernel_is_initialized = false;
}

const auto* input = GetInput(context, node, 0);
const auto* packed_filter = GetInput(context, node, 1);
const auto* filter = GetInput(context, node, 1);
auto* output = GetOutput(context, node, 0);

const auto bitpacked_input_shape = GetTensorShape(input);
const auto output_shape = GetTensorShape(output);

const auto kernel = core::indirect_bgemm::SelectRuntimeKernel<DstScalar>(
conv_params, bitpacked_input_shape, output_shape);

OutputTransform<DstScalar> output_transform;
GetOutputTransform(output_transform, context, node, conv_params);

if (!kernel_is_initialized) {
core::indirect_bgemm::FillIndirectionBuffer(
kernel.block_size_pixels, conv_params, bitpacked_input_shape,
output_shape, GetTensorData<TBitpacked>(input),
conv_params->indirection_buffer, conv_params->zero_buffer);
core::indirect_bgemm::PackWeights(
kernel.block_size_output_channels, conv_params, bitpacked_input_shape,
output_shape, GetTensorData<TBitpacked>(filter),
conv_params->packed_weights);
}

core::bconv2d::BConv2DOptimizedIndirectBGEMM<DstScalar>(
kernel, conv_params, bitpacked_input_shape, output_shape,
output_transform, conv_params->packed_weights.data(),
conv_params->indirection_buffer.data(), GetTensorData<DstScalar>(output),
conv_params->padding_buffer.data(), conv_params->pad_value);
}

template <typename DstScalar>
void EvalRef(TfLiteContext* context, TfLiteNode* node,
TfLiteBConv2DParams* params) {
if (!params->one_time_setup_complete) {
OneTimeSetup(context, node, params);
}

const auto* input = GetInput(context, node, 0);
const auto* packed_filter = GetInput(context, node, 1);
auto* output = GetOutput(context, node, 0);

// Using the standard TF Lite ConvParams struct.
// This requires extra step of converting the TfLiteBConv2DParams
// but unifies the interface with the default TF lite API for CONV params
@@ -399,7 +449,6 @@ void EvalRef(TfLiteContext* context, TfLiteNode* node,
OutputTransform<DstScalar> output_transform;
GetOutputTransform(output_transform, context, node, params);

TfLiteTensor* im2col = nullptr;
core::bconv2d::BConv2DReference<std::int32_t, DstScalar>(
op_params, GetTensorShape(input), GetTensorData<TBitpacked>(input),
GetTensorShape(packed_filter), GetTensorData<TBitpacked>(packed_filter),
@@ -411,7 +460,7 @@ template <KernelType kernel_type, typename DstScalar>
inline TfLiteStatus EvalChooseKernelType(TfLiteContext* context,
TfLiteNode* node,
TfLiteBConv2DParams* params) {
if (kernel_type == KernelType::kRuyOptimized) {
if (kernel_type == KernelType::kOptimizedBGEMM) {
#if RUY_PLATFORM_ARM_64
// On 64 bit Arm only there is an optimised kernel with 16-bit accumulators.
// It is safe to use this without risk of overflow as long as the maximum
@@ -422,14 +471,17 @@ inline TfLiteStatus EvalChooseKernelType(TfLiteContext* context,
const int depth =
params->filter_height * params->filter_width * params->channels_in;
if (depth + 512 < 1 << 16) {
EvalOpt<std::int16_t, DstScalar>(context, node, params);
EvalOptBGEMM<std::int16_t, DstScalar>(context, node, params);
return kTfLiteOk;
}
#endif
// In all other cases, use 32-bit accumulators.
EvalOpt<std::int32_t, DstScalar>(context, node, params);
EvalOptBGEMM<std::int32_t, DstScalar>(context, node, params);
return kTfLiteOk;
} else if (kernel_type == KernelType::kOptimizedIndirectBGEMM) {
EvalOptIndirectBGEMM<DstScalar>(context, node, params);
return kTfLiteOk;
} else if (kernel_type == KernelType::kGenericRef) {
} else if (kernel_type == KernelType::kReference) {
EvalRef<DstScalar>(context, node, params);
return kTfLiteOk;
}
@@ -456,23 +508,31 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
TfLiteRegistration* Register_BCONV_2D_REF() {
static TfLiteRegistration r = {
bconv2d::Init, bconv2d::Free,
bconv2d::Prepare<bconv2d::KernelType::kGenericRef>,
bconv2d::Eval<bconv2d::KernelType::kGenericRef>};
bconv2d::Prepare<bconv2d::KernelType::kReference>,
bconv2d::Eval<bconv2d::KernelType::kReference>};
return &r;
}

TfLiteRegistration* Register_BCONV_2D_OPT_BGEMM() {
static TfLiteRegistration r = {
bconv2d::Init, bconv2d::Free,
bconv2d::Prepare<bconv2d::KernelType::kOptimizedBGEMM>,
bconv2d::Eval<bconv2d::KernelType::kOptimizedBGEMM>};
return &r;
}

TfLiteRegistration* Register_BCONV_2D_OPT() {
TfLiteRegistration* Register_BCONV_2D_OPT_INDIRECT_BGEMM() {
static TfLiteRegistration r = {
bconv2d::Init, bconv2d::Free,
bconv2d::Prepare<bconv2d::KernelType::kRuyOptimized>,
bconv2d::Eval<bconv2d::KernelType::kRuyOptimized>};
bconv2d::Prepare<bconv2d::KernelType::kOptimizedIndirectBGEMM>,
bconv2d::Eval<bconv2d::KernelType::kOptimizedIndirectBGEMM>};
return &r;
}

// Use this registration wrapper to decide which implementation to use.
TfLiteRegistration* Register_BCONV_2D() {
#if defined TFLITE_WITH_RUY
return Register_BCONV_2D_OPT();
return Register_BCONV_2D_OPT_BGEMM();
#else
return Register_BCONV_2D_REF();
#endif
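Note that the Register_BCONV_2D() wrapper above still selects the standard BGEMM (or reference) kernel; to exercise the new indirect-BGEMM path an application registers it explicitly. A hedged sketch of that wiring follows; the custom-op name "LceBconv2d" is an assumption here and should be checked against the converter output.

// Sketch: registering the new kernel with a TF Lite op resolver.
#include "tensorflow/lite/kernels/register.h"

namespace compute_engine {
namespace tflite {
TfLiteRegistration* Register_BCONV_2D_OPT_INDIRECT_BGEMM();
}  // namespace tflite
}  // namespace compute_engine

void RegisterLceOps(::tflite::ops::builtin::BuiltinOpResolver* resolver) {
  // "LceBconv2d" is the assumed custom-op name; verify before relying on it.
  resolver->AddCustom(
      "LceBconv2d",
      compute_engine::tflite::Register_BCONV_2D_OPT_INDIRECT_BGEMM());
}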
8 changes: 8 additions & 0 deletions larq_compute_engine/tflite/kernels/bconv2d_params.h
@@ -3,12 +3,15 @@

#include <vector>

#include "larq_compute_engine/core/types.h"
#include "tensorflow/lite/c/builtin_op_data.h"

namespace compute_engine {
namespace tflite {
namespace bconv2d {

using core::TBitpacked;

const int kTensorNotAllocated = -1;

struct TfLiteBConv2DParams {
@@ -44,6 +47,11 @@ struct TfLiteBConv2DParams {
// This is used when we have 'same-zero' padding.
std::vector<float> padding_buffer;

// These are used in the 'indirect bgemm' kernels.
std::vector<TBitpacked> packed_weights;
std::vector<const TBitpacked*> indirection_buffer;
std::vector<TBitpacked> zero_buffer;

// IDs are the arbitrary identifiers used by TF Lite to identify and access
// memory buffers. They are unique in the entire TF Lite context.
int im2col_id = kTensorNotAllocated;
36 changes: 20 additions & 16 deletions larq_compute_engine/tflite/tests/bconv2d_test.cc
@@ -187,7 +187,8 @@ namespace compute_engine {
namespace tflite {

TfLiteRegistration* Register_BCONV_2D_REF();
TfLiteRegistration* Register_BCONV_2D_OPT();
TfLiteRegistration* Register_BCONV_2D_OPT_BGEMM();
TfLiteRegistration* Register_BCONV_2D_OPT_INDIRECT_BGEMM();

namespace testing {

@@ -272,12 +273,14 @@ struct TestParam {

std::string kernel_name = "Unknown";
register_function registration =
compute_engine::tflite::Register_BCONV_2D_OPT;
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM;
};

const auto kKernelMap = new std::map<string, register_function>({
{"BConv2DREF", compute_engine::tflite::Register_BCONV_2D_REF},
{"BConv2DOPT", compute_engine::tflite::Register_BCONV_2D_OPT},
{"BConv2D_REF", compute_engine::tflite::Register_BCONV_2D_REF},
{"BConv2D_OPT_BGEMM", compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM},
{"BConv2D_OPT_INDIRECT_BGEMM",
compute_engine::tflite::Register_BCONV_2D_OPT_INDIRECT_BGEMM},
});

class BConv2DOpTest : public ::testing::TestWithParam<TestParamTuple> {
@@ -713,7 +716,8 @@ INSTANTIATE_TEST_SUITE_P(
ActivationFunctionType_RELU), // activation function
Values(1, 2), // number of threads
Values(std::pair<std::string, register_function>{
"BConv2DOPT", compute_engine::tflite::Register_BCONV_2D_OPT})),
"BConv2D_RUY_OPT_BGEMM",
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM})),
TestParam::TestNameSuffix);

// The BigTest suite will be skipped in the qemu CI runs as they take more than
@@ -757,11 +761,11 @@ TEST(BConv2DTests, ReluErrorDeathTest) {
// Test if fused ReLu throws an error in combination with zero-padding
EXPECT_DEATH(
{
FP_BConv2DOpModel m_lce(compute_engine::tflite::Register_BCONV_2D_OPT,
input_tensor, packed_filter_tensor,
output_tensor, post_tensor, post_tensor,
threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_RELU, 1, 1, 1);
FP_BConv2DOpModel m_lce(
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM, input_tensor,
packed_filter_tensor, output_tensor, post_tensor, post_tensor,
threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_RELU, 1, 1, 1);
},
"Fused activations are only supported with valid or one-padding.");

@@ -770,7 +774,7 @@ TEST(BConv2DTests, ReluErrorDeathTest) {
EXPECT_DEATH(
{
Bitpacked_BConv2DOpModel m_lce(
compute_engine::tflite::Register_BCONV_2D_OPT, input_tensor,
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM, input_tensor,
packed_filter_tensor, packed_output_tensor, post_tensor,
post_tensor, threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_NONE, 1, 1, 1);
@@ -793,11 +797,11 @@ TEST(BConv2DTests, Int8ErrorDeathTest) {

EXPECT_DEATH(
{
Int8_BConv2DOpModel m_lce(compute_engine::tflite::Register_BCONV_2D_OPT,
input_tensor, packed_filter_tensor,
output_tensor, post_tensor, post_tensor,
threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_NONE, 1, 1, 1);
Int8_BConv2DOpModel m_lce(
compute_engine::tflite::Register_BCONV_2D_OPT_BGEMM, input_tensor,
packed_filter_tensor, output_tensor, post_tensor, post_tensor,
threshold_tensor, 64, 1, 1, Padding_SAME, 0,
ActivationFunctionType_NONE, 1, 1, 1);
},
"8-bit quantization is only supported with valid or one-padding.");
}