Skip to content

Commit

Permalink
Remove u32-f32-cvt kernels
Browse files Browse the repository at this point in the history
We don't really properly support uint32 datatypes. This is only used for converting reduction outputs back to qs8. However, we don't do this via uint32 anywhere else (e.g. gemms), and I don't think we can properly support it.

Using s32 instead just means accumulators are effectively 31 bits instead of 32, which seems insignificant.

PiperOrigin-RevId: 694617584
  • Loading branch information
dsharletg authored and xnnpack-bot committed Nov 8, 2024
1 parent 6d50b79 commit f81907b
Show file tree
Hide file tree
Showing 45 changed files with 10 additions and 2,545 deletions.
5 changes: 0 additions & 5 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ load(
"xnnpack_simd_s16_archs",
"xnnpack_simd_s32_archs",
"xnnpack_simd_s8_archs",
"xnnpack_simd_u32_archs",
)
load(
"//gen:microkernels.bzl",
Expand Down Expand Up @@ -189,7 +188,6 @@ MICROKERNEL_DEFS = [
"src/s32-f32-vcvt/s32-f32-vcvt.h",
"src/u8-maxpool/u8-maxpool-minmax.h",
"src/u8-vclamp/u8-vclamp.h",
"src/u32-f32-vcvt/u32-f32-vcvt.h",
"src/xx-fill/xx-fill.h",
"src/xx-pad/xx-pad.h",
"src/xx-transposev/xx-transposev.h",
Expand Down Expand Up @@ -275,9 +273,6 @@ SIMD_HEADERS = [
] + [
"src/xnnpack/simd/s8-" + arch + ".h"
for arch in xnnpack_simd_s8_archs()
] + [
"src/xnnpack/simd/u32-" + arch + ".h"
for arch in xnnpack_simd_u32_archs()
]

exports_files(SIMD_HEADERS)
Expand Down
3 changes: 1 addition & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1679,8 +1679,7 @@ IF(XNNPACK_BUILD_TESTS)
qs8-f32-vcvt
qs8-vcvt
qu8-f32-vcvt
qu8-vcvt
u32-f32-vcvt)
qu8-vcvt)
FOREACH(TEST ${MICROKERNEL_VCVT_TESTS})
ADD_EXECUTABLE(${TEST}-test test/${TEST}.cc)
TARGET_INCLUDE_DIRECTORIES(${TEST}-test PRIVATE include src test)
Expand Down
1 change: 0 additions & 1 deletion bench/vunary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,6 @@ void vlrelu(benchmark::State& state, uint64_t arch_flags,
#include "qu8-f32-vcvt/qu8-f32-vcvt.h"
#include "qu8-vcvt/qu8-vcvt.h"
#include "s32-f32-vcvt/s32-f32-vcvt.h"
#include "u32-f32-vcvt/u32-f32-vcvt.h"
#undef XNN_CVT_UKERNEL_WITH_PARAMS

#ifndef XNNPACK_BENCHMARK_NO_MAIN
Expand Down
3 changes: 0 additions & 3 deletions build_params.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -268,9 +268,6 @@ def xnnpack_simd_s16_archs():
def xnnpack_simd_s32_archs():
return ["avx2", "avx512f", "neon", "scalar", "sse41", "hvx", "wasmsimd"]

def xnnpack_simd_u32_archs():
return ["avx2", "avx512f", "neon", "scalar", "sse41", "wasmsimd"]

def xnnpack_simd_s8_archs():
return ["scalar"]

Expand Down
1 change: 0 additions & 1 deletion cmake/gen/avx2_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ SET(PROD_AVX2_MICROKERNEL_SRCS
src/s8-vclamp/s8-vclamp-avx2-u128.c
src/s32-f32-vcvt/gen/s32-f32-vcvt-avx2.c
src/u8-vclamp/u8-vclamp-avx2-u128.c
src/u32-f32-vcvt/gen/u32-f32-vcvt-avx2.c
src/x8-lut/gen/x8-lut-avx2-u128.c
src/x8-transposec/gen/x8-transposec-32x32-reuse-switch-avx2.c
src/x16-packw/gen/x16-packw-x16-gemm-goi-avx2-u16-prfm.c
Expand Down
1 change: 0 additions & 1 deletion cmake/gen/avx512f_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ SET(PROD_AVX512F_MICROKERNEL_SRCS
src/f32-vunary/gen/f32-vneg-avx512f.c
src/f32-vunary/gen/f32-vsqr-avx512f.c
src/s32-f32-vcvt/gen/s32-f32-vcvt-avx512f.c
src/u32-f32-vcvt/gen/u32-f32-vcvt-avx512f.c
src/x32-packw/gen/x32-packw-x32-gemm-goi-avx512f-u4-prfm.c)

SET(NON_PROD_AVX512F_MICROKERNEL_SRCS
Expand Down
1 change: 0 additions & 1 deletion cmake/gen/neon_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS
src/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c
src/u8-rmax/u8-rmax-neon-u16.c
src/u8-vclamp/u8-vclamp-neon-u64.c
src/u32-f32-vcvt/gen/u32-f32-vcvt-neon.c
src/x8-transposec/gen/x8-transposec-16x16-reuse-dec-zip-neon.c
src/x8-zip/x8-zip-x2-neon.c
src/x8-zip/x8-zip-x3-neon.c
Expand Down
1 change: 0 additions & 1 deletion cmake/gen/scalar_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS
src/u8-maxpool/u8-maxpool-9p8x-minmax-scalar-c1.c
src/u8-rmax/u8-rmax-scalar-u2.c
src/u8-vclamp/u8-vclamp-scalar-u4.c
src/u32-f32-vcvt/gen/u32-f32-vcvt-scalar.c
src/x8-lut/gen/x8-lut-scalar-u4.c
src/x8-packq/x8-packq-scalar-f32qp8-u1.c
src/x8-packw/gen/x8-packw-x4-gemm-goi-scalar-u2.c
Expand Down
1 change: 0 additions & 1 deletion cmake/gen/wasmsimd_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,6 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS
src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c8.c
src/u8-maxpool/u8-maxpool-9p8x-minmax-wasmsimd-c16.c
src/u8-vclamp/u8-vclamp-wasmsimd-u64.c
src/u32-f32-vcvt/gen/u32-f32-vcvt-wasmsimd.c
src/x8-lut/gen/x8-lut-wasmsimd-u32.c
src/x8-transposec/gen/x8-transposec-16x16-reuse-mov-wasmsimd.c
src/x16-transposec/gen/x16-transposec-8x8-reuse-mov-wasmsimd.c
Expand Down
1 change: 0 additions & 1 deletion gen/avx2_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ PROD_AVX2_MICROKERNEL_SRCS = [
"src/s8-vclamp/s8-vclamp-avx2-u128.c",
"src/s32-f32-vcvt/gen/s32-f32-vcvt-avx2.c",
"src/u8-vclamp/u8-vclamp-avx2-u128.c",
"src/u32-f32-vcvt/gen/u32-f32-vcvt-avx2.c",
"src/x8-lut/gen/x8-lut-avx2-u128.c",
"src/x8-transposec/gen/x8-transposec-32x32-reuse-switch-avx2.c",
"src/x16-packw/gen/x16-packw-x16-gemm-goi-avx2-u16-prfm.c",
Expand Down
1 change: 0 additions & 1 deletion gen/avx512f_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ PROD_AVX512F_MICROKERNEL_SRCS = [
"src/f32-vunary/gen/f32-vneg-avx512f.c",
"src/f32-vunary/gen/f32-vsqr-avx512f.c",
"src/s32-f32-vcvt/gen/s32-f32-vcvt-avx512f.c",
"src/u32-f32-vcvt/gen/u32-f32-vcvt-avx512f.c",
"src/x32-packw/gen/x32-packw-x32-gemm-goi-avx512f-u4-prfm.c",
]

Expand Down
1 change: 0 additions & 1 deletion gen/neon_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ PROD_NEON_MICROKERNEL_SRCS = [
"src/u8-maxpool/u8-maxpool-9p8x-minmax-neon-c16.c",
"src/u8-rmax/u8-rmax-neon-u16.c",
"src/u8-vclamp/u8-vclamp-neon-u64.c",
"src/u32-f32-vcvt/gen/u32-f32-vcvt-neon.c",
"src/x8-transposec/gen/x8-transposec-16x16-reuse-dec-zip-neon.c",
"src/x8-zip/x8-zip-x2-neon.c",
"src/x8-zip/x8-zip-x3-neon.c",
Expand Down
1 change: 0 additions & 1 deletion gen/scalar_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [
"src/u8-maxpool/u8-maxpool-9p8x-minmax-scalar-c1.c",
"src/u8-rmax/u8-rmax-scalar-u2.c",
"src/u8-vclamp/u8-vclamp-scalar-u4.c",
"src/u32-f32-vcvt/gen/u32-f32-vcvt-scalar.c",
"src/x8-lut/gen/x8-lut-scalar-u4.c",
"src/x8-packq/x8-packq-scalar-f32qp8-u1.c",
"src/x8-packw/gen/x8-packw-x4-gemm-goi-scalar-u2.c",
Expand Down
1 change: 0 additions & 1 deletion gen/wasmsimd_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,6 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [
"src/u8-ibilinear/gen/u8-ibilinear-wasmsimd-dot16x2-c8.c",
"src/u8-maxpool/u8-maxpool-9p8x-minmax-wasmsimd-c16.c",
"src/u8-vclamp/u8-vclamp-wasmsimd-u64.c",
"src/u32-f32-vcvt/gen/u32-f32-vcvt-wasmsimd.c",
"src/x8-lut/gen/x8-lut-wasmsimd-u32.c",
"src/x8-transposec/gen/x8-transposec-16x16-reuse-mov-wasmsimd.c",
"src/x16-transposec/gen/x16-transposec-8x8-reuse-mov-wasmsimd.c",
Expand Down
7 changes: 0 additions & 7 deletions scripts/generate-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -291,13 +291,6 @@ tools/xngen test/s32-simd.cc.in -D ARCH=avx512f -D ARCH_MACRO="XNN_ARCH_X86 || X
tools/xngen test/s32-simd.cc.in -D ARCH=wasmsimd -D ARCH_MACRO="XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD" -D TEST_REQUIRES="" -o test/s32-simd-wasmsimd.cc &
tools/xngen test/s32-simd.cc.in -D ARCH=hvx -D ARCH_MACRO="XNN_ENABLE_HVX && XNN_ARCH_HEXAGON" -D TEST_REQUIRES=TEST_REQUIRES_HVX -o test/s32-simd-hvx.cc &

tools/xngen test/u32-simd.cc.in -D ARCH=scalar -D ARCH_MACRO="" -D TEST_REQUIRES="" -o test/u32-simd-scalar.cc &
tools/xngen test/u32-simd.cc.in -D ARCH=neon -D ARCH_MACRO="XNN_ARCH_ARM || XNN_ARCH_ARM64" -D TEST_REQUIRES=TEST_REQUIRES_ARM_NEON -o test/u32-simd-neon.cc &
tools/xngen test/u32-simd.cc.in -D ARCH=sse41 -D ARCH_MACRO="XNN_ARCH_X86 || XNN_ARCH_X86_64" -D TEST_REQUIRES=TEST_REQUIRES_X86_SSE41 -o test/u32-simd-sse41.cc &
tools/xngen test/u32-simd.cc.in -D ARCH=avx2 -D ARCH_MACRO="XNN_ARCH_X86 || XNN_ARCH_X86_64" -D TEST_REQUIRES=TEST_REQUIRES_X86_AVX2 -o test/u32-simd-avx2.cc &
tools/xngen test/u32-simd.cc.in -D ARCH=avx512f -D ARCH_MACRO="XNN_ARCH_X86 || XNN_ARCH_X86_64" -D TEST_REQUIRES=TEST_REQUIRES_X86_AVX512F -o test/u32-simd-avx512f.cc &
tools/xngen test/u32-simd.cc.in -D ARCH=wasmsimd -D ARCH_MACRO="XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD" -D TEST_REQUIRES="" -o test/u32-simd-wasmsimd.cc &

tools/xngen test/s8-simd.cc.in -D ARCH=scalar -D ARCH_MACRO="" -D TEST_REQUIRES="" -o test/s8-simd-scalar.cc &

wait
22 changes: 0 additions & 22 deletions scripts/generate-u32-f32-vcvt.sh

This file was deleted.

47 changes: 0 additions & 47 deletions src/configs/unary-elementwise-config.c
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ static struct xnn_unary_elementwise_config qu8_cvt_config = {0};
static struct xnn_unary_elementwise_config qu8_lrelu_config = {0};
static struct xnn_unary_elementwise_config qu8_to_f32_cvt_config = {0};
static struct xnn_unary_elementwise_config s32_to_f32_cvt_config = {0};
static struct xnn_unary_elementwise_config u32_to_f32_cvt_config = {0};
static struct xnn_unary_elementwise_config s8_clamp_config = {0};
static struct xnn_unary_elementwise_config u8_clamp_config = {0};
static struct xnn_unary_elementwise_config xx_copy_config = {0};
Expand Down Expand Up @@ -119,7 +118,6 @@ XNN_INIT_ONCE_GUARD(qu8_cvt);
XNN_INIT_ONCE_GUARD(qu8_lrelu);
XNN_INIT_ONCE_GUARD(qu8_to_f32_cvt);
XNN_INIT_ONCE_GUARD(s32_to_f32_cvt);
XNN_INIT_ONCE_GUARD(u32_to_f32_cvt);
XNN_INIT_ONCE_GUARD(s8_clamp);
XNN_INIT_ONCE_GUARD(u8_clamp);
XNN_INIT_ONCE_GUARD(xx_copy);
Expand Down Expand Up @@ -1498,42 +1496,6 @@ static void init_s32_to_f32_cvt_config(void) {
#endif
}

static void init_u32_to_f32_cvt_config(void) {
#if XNN_ARCH_ARM
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
if (hardware_config->use_arm_neon) {
u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__neon_u16;
u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params;
} else {
u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__scalar_u4;
u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params;
}
#elif XNN_ARCH_X86 || XNN_ARCH_X86_64
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
assert(hardware_config != NULL);
#if XNN_ENABLE_AVX512F
if (!XNN_PLATFORM_MOBILE && hardware_config->use_x86_avx512f) {
u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__avx512f_u64;
u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params;
} else
#endif
if (hardware_config->use_x86_avx2) {
u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__avx2_u32;
u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params;
} else {
u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__scalar_u4;
u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params;
}
#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__wasmsimd_u16;
u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params;
#else
u32_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_u32_f32_vcvt_ukernel__scalar_u4;
u32_to_f32_cvt_config.init = (xnn_init_unary_uparams_fn) xnn_init_u32_f32_cvt_scalar_params;
#endif
}

static void init_qs8_cvt_config(void) {
#if XNN_ARCH_ARM
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
Expand Down Expand Up @@ -2405,15 +2367,6 @@ const struct xnn_unary_elementwise_config* xnn_init_s32_to_f32_cvt_config() {
return &s32_to_f32_cvt_config;
}

const struct xnn_unary_elementwise_config* xnn_init_u32_to_f32_cvt_config() {
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
if (hardware_config == NULL) {
return NULL;
}
XNN_INIT_ONCE(u32_to_f32_cvt);
return &u32_to_f32_cvt_config;
}

const struct xnn_unary_elementwise_config* xnn_init_qs8_cvt_config() {
const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
if (hardware_config == NULL) {
Expand Down
10 changes: 0 additions & 10 deletions src/microparams-init.c
Original file line number Diff line number Diff line change
Expand Up @@ -1266,16 +1266,6 @@ size_t xnn_init_s32_f32_cvt_scalar_params(
return sizeof(params->s32_f32_cvt);
}

size_t xnn_init_u32_f32_cvt_scalar_params(
union xnn_unary_uparams* params,
const union xnn_unary_params* op_params,
const struct xnn_quantization_params* input_quantization,
const struct xnn_quantization_params* output_quantization)
{
params->u32_f32_cvt.scalar.zero_point = input_quantization->zero_point;
return sizeof(params->u32_f32_cvt);
}

size_t xnn_init_qs8_cvt_scalar_params(
union xnn_unary_uparams* params,
const union xnn_unary_params* op_params,
Expand Down
12 changes: 6 additions & 6 deletions src/operator-run.c
Original file line number Diff line number Diff line change
Expand Up @@ -2102,10 +2102,10 @@ void xnn_compute_contiguous_reduce(
context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
output_ptr, (union xnn_unary_uparams*) &cvt_params);
} else if (context->u32_f32_cvt_ukernel) {
struct xnn_u32_f32_cvt_params u32_f32_cvt_params;
u32_f32_cvt_params.scalar.zero_point = context->params.qu8.num_elements * (int32_t) context->params.qu8.input_zero_point;
struct xnn_s32_f32_cvt_params s32_f32_cvt_params;
s32_f32_cvt_params.scalar.zero_point = context->params.qu8.num_elements * (int32_t) context->params.qu8.input_zero_point;
context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
workspace_ptr, (union xnn_unary_uparams*) &u32_f32_cvt_params);
workspace_ptr, (union xnn_unary_uparams*) &s32_f32_cvt_params);
struct xnn_f32_qu8_cvt_params cvt_params;
cvt_params.scalar.scale = context->params.qu8.scale;
cvt_params.scalar.output_zero_point = context->params.qu8.output_zero_point;
Expand Down Expand Up @@ -2185,10 +2185,10 @@ void xnn_compute_discontiguous_reduce(
context->cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
output_ptr, (union xnn_unary_uparams*) &cvt_params);
} else if (context->u32_f32_cvt_ukernel) {
struct xnn_u32_f32_cvt_params u32_f32_cvt_params;
u32_f32_cvt_params.scalar.zero_point = context->params.qu8.num_elements * (int32_t) context->params.qu8.input_zero_point;
struct xnn_s32_f32_cvt_params s32_f32_cvt_params;
s32_f32_cvt_params.scalar.zero_point = context->params.qu8.num_elements * (int32_t) context->params.qu8.input_zero_point;
context->u32_f32_cvt_ukernel(context->accumulation_element_size * output2_block_size, workspace_ptr,
workspace_ptr, (union xnn_unary_uparams*) &u32_f32_cvt_params);
workspace_ptr, (union xnn_unary_uparams*) &s32_f32_cvt_params);
struct xnn_f32_qu8_cvt_params cvt_params;
cvt_params.scalar.scale = context->params.qu8.scale;
cvt_params.scalar.output_zero_point = context->params.qu8.output_zero_point;
Expand Down
4 changes: 3 additions & 1 deletion src/operators/reduce-nd.c
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,9 @@ enum xnn_status xnn_create_reduce_nd(
rdsum_config = xnn_init_qu8_rdsum_config();
cvt_config = xnn_init_f32_to_qu8_cvt_config();
s32_f32_cvt_config = unused;
u32_f32_cvt_config = xnn_init_u32_to_f32_cvt_config();
// We just use an int32 -> f32 conversion. This means we effectively only
// have a 31-bit accumulator instead of 32-bit, but that seems insignificant.
u32_f32_cvt_config = xnn_init_s32_to_f32_cvt_config();
break;
}
default:
Expand Down
Loading

0 comments on commit f81907b

Please sign in to comment.