From 331e10744ffd05bbd51d310c99274e646692c079 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 2 Oct 2024 18:21:22 -0700 Subject: [PATCH] Run generator scripts - update qb8 test/benchmark - sort reduce.h - sort BUILD PiperOrigin-RevId: 681666630 --- BUILD.bazel | 72 ++++++++++++++++---------------- bench/qp8-f32-qb4w-gemm.cc | 8 ++-- src/xnnpack/reduce.h | 18 ++++---- test/qp8-f32-qb4w-gemm-minmax.cc | 4 +- 4 files changed, 51 insertions(+), 51 deletions(-) diff --git a/BUILD.bazel b/BUILD.bazel index 35e2228344f..baa9cec726b 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -61,8 +61,12 @@ exports_files(["preamble.js.lds"]) MICROKERNEL_DEFS = [ "src/bf16-vabs/bf16-vabs.h", "src/f16-avgpool/f16-avgpool-minmax.h", + "src/f16-dwconv/f16-dwconv-minmax-multipass.h", + "src/f16-dwconv/f16-dwconv-minmax-unipass.h", + "src/f16-f32-vcvt/f16-f32-vcvt.h", "src/f16-maxpool/f16-maxpool-minmax.h", "src/f16-pavgpool/f16-pavgpool-minmax.h", + "src/f16-qs8-vcvt/f16-qs8-vcvt.h", "src/f16-vabs/f16-vabs.h", "src/f16-vbinary/f16-vadd.h", "src/f16-vbinary/f16-vaddc.h", @@ -99,8 +103,15 @@ MICROKERNEL_DEFS = [ "src/f16-vsqrt/f16-vsqrt.h", "src/f16-vtanh/f16-vtanh.h", "src/f32-avgpool/f32-avgpool-minmax.h", + "src/f32-dwconv/f32-dwconv-minmax-multipass.h", + "src/f32-dwconv/f32-dwconv-minmax-unipass.h", + "src/f32-dwconv/f32-dwconv-multipass.h", + "src/f32-dwconv/f32-dwconv-unipass.h", + "src/f32-f16-vcvt/f32-f16-vcvt.h", "src/f32-maxpool/f32-maxpool-minmax.h", "src/f32-pavgpool/f32-pavgpool-minmax.h", + "src/f32-qs8-vcvt/f32-qs8-vcvt.h", + "src/f32-qu8-vcvt/f32-qu8-vcvt.h", "src/f32-vabs/f32-vabs.h", "src/f32-vbinary/f32-vadd.h", "src/f32-vbinary/f32-vaddc.h", @@ -143,18 +154,34 @@ MICROKERNEL_DEFS = [ "src/f32-vsqr/f32-vsqr.h", "src/f32-vsqrt/f32-vsqrt.h", "src/f32-vtanh/f32-vtanh.h", + "src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h", + "src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h", + "src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h", + "src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h", + "src/qs8-f16-vcvt/qs8-f16-vcvt.h", + "src/qs8-f32-vcvt/qs8-f32-vcvt.h", "src/qs8-packw/qs8-packw.h", + "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h", + "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h", "src/qs8-vadd/qs8-vadd-minmax.h", "src/qs8-vaddc/qs8-vaddc-minmax.h", + "src/qs8-vcvt/qs8-vcvt.h", "src/qs8-vhswish/qs8-vhswish.h", "src/qs8-vlrelu/qs8-vlrelu.h", "src/qs8-vmul/qs8-vmul-minmax-fp32.h", "src/qs8-vmul/qs8-vmul-minmax-rndnu.h", "src/qs8-vmulc/qs8-vmulc-minmax-fp32.h", "src/qs8-vmulc/qs8-vmulc-minmax-rndnu.h", + "src/qs16-qs8-vcvt/qs16-qs8-vcvt.h", "src/qu8-avgpool/qu8-avgpool-minmax.h", + "src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h", + "src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h", + "src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h", + "src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h", + "src/qu8-f32-vcvt/qu8-f32-vcvt.h", "src/qu8-vadd/qu8-vadd-minmax.h", "src/qu8-vaddc/qu8-vaddc-minmax.h", + "src/qu8-vcvt/qu8-vcvt.h", "src/qu8-vhswish/qu8-vhswish.h", "src/qu8-vlrelu/qu8-vlrelu.h", "src/qu8-vmul/qu8-vmul-minmax-fp32.h", @@ -163,54 +190,27 @@ MICROKERNEL_DEFS = [ "src/qu8-vmulc/qu8-vmulc-minmax-rndnu.h", "src/s8-maxpool/s8-maxpool-minmax.h", "src/s8-vclamp/s8-vclamp.h", + "src/s32-f32-vcvt/s32-f32-vcvt.h", "src/s32-vmul/s32-vmul.h", "src/s32-vmul/s32-vmulc.h", "src/u8-maxpool/u8-maxpool-minmax.h", "src/u8-vclamp/u8-vclamp.h", + "src/u32-f32-vcvt/u32-f32-vcvt.h", + "src/xx-fill/xx-fill.h", + "src/xx-pad/xx-pad.h", + "src/xx-transposev/xx-transposev.h", "src/x8-packq/x8-packq.h", "src/x8-packw/x8-packw.h", + "src/x8-transposec/x8-transposec.h", "src/x16-packw/x16-packw.h", + "src/x16-transposec/x16-transposec.h", + "src/x24-transposec/x24-transposec.h", "src/x32-packb/x32-packb.h", "src/x32-packw/x32-packw.h", "src/x32-packx/x32-packx.h", - "src/x32-zerob/x32-zerob.h", - "src/f16-f32-vcvt/f16-f32-vcvt.h", - "src/f32-qs8-vcvt/f32-qs8-vcvt.h", - "src/qs8-f16-vcvt/qs8-f16-vcvt.h", - "src/qu8-f32-vcvt/qu8-f32-vcvt.h", - "src/f16-qs8-vcvt/f16-qs8-vcvt.h", - "src/f32-qu8-vcvt/f32-qu8-vcvt.h", - "src/qs8-f32-vcvt/qs8-f32-vcvt.h", - "src/qu8-vcvt/qu8-vcvt.h", - "src/f32-f16-vcvt/f32-f16-vcvt.h", - "src/qs16-qs8-vcvt/qs16-qs8-vcvt.h", - "src/qs8-vcvt/qs8-vcvt.h", - "src/s32-f32-vcvt/s32-f32-vcvt.h", - "src/u32-f32-vcvt/u32-f32-vcvt.h", - "src/x8-transposec/x8-transposec.h", - "src/x16-transposec/x16-transposec.h", - "src/x24-transposec/x24-transposec.h", "src/x32-transposec/x32-transposec.h", + "src/x32-zerob/x32-zerob.h", "src/x64-transposec/x64-transposec.h", - "src/xx-transposev/xx-transposev.h", - "src/xx-fill/xx-fill.h", - "src/xx-pad/xx-pad.h", - "src/f16-dwconv/f16-dwconv-minmax-unipass.h", - "src/f32-dwconv/f32-dwconv-minmax-unipass.h", - "src/f32-dwconv/f32-dwconv-unipass.h", - "src/qs8-dwconv/qs8-dwconv-minmax-unipass-fp32.h", - "src/qs8-dwconv/qs8-dwconv-minmax-unipass-rndnu.h", - "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-unipass-fp32.h", - "src/qu8-dwconv/qu8-dwconv-minmax-unipass-fp32.h", - "src/qu8-dwconv/qu8-dwconv-minmax-unipass-rndnu.h", - "src/f16-dwconv/f16-dwconv-minmax-multipass.h", - "src/f32-dwconv/f32-dwconv-minmax-multipass.h", - "src/f32-dwconv/f32-dwconv-multipass.h", - "src/qs8-dwconv/qs8-dwconv-minmax-multipass-fp32.h", - "src/qs8-dwconv/qs8-dwconv-minmax-multipass-rndnu.h", - "src/qs8-qc8w-dwconv/qs8-qc8w-dwconv-minmax-multipass-fp32.h", - "src/qu8-dwconv/qu8-dwconv-minmax-multipass-fp32.h", - "src/qu8-dwconv/qu8-dwconv-minmax-multipass-rndnu.h", ] MICROKERNEL_HDRS = [ diff --git a/bench/qp8-f32-qb4w-gemm.cc b/bench/qp8-f32-qb4w-gemm.cc index 75de5c9c8dc..601cc7a1ad3 100644 --- a/bench/qp8-f32-qb4w-gemm.cc +++ b/bench/qp8-f32-qb4w-gemm.cc @@ -21,7 +21,7 @@ #if XNN_ENABLE_ARM_I8MM && XNN_ARCH_ARM64 #if XNN_ENABLE_KLEIDIAI - static void qp8_f32_qb4w_gemm_minmax_ukernel_4x8c16s2__aarch64_neoni8mm(benchmark::State& state, const char* net) { + static void qp8_f32_qb4w_gemm_minmax_ukernel_4x8c16s2__neoni8mm(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_qp8_f32_qb4w_gemm_minmax_ukernel_4x8c16s2__neoni8mm, xnn_init_f32_qb4w_minmax_scalar_params, @@ -32,9 +32,9 @@ benchmark::utils::CheckNEONI8MM); } - BENCHMARK_GEMM_BL(qp8_f32_qb4w_gemm_minmax_ukernel_4x8c16s2__aarch64_neoni8mm) + BENCHMARK_GEMM_BL(qp8_f32_qb4w_gemm_minmax_ukernel_4x8c16s2__neoni8mm) - static void qp8_f32_qb4w_gemm_minmax_ukernel_8x4c16s2__aarch64_neoni8mm_mstep2(benchmark::State& state, const char* net) { + static void qp8_f32_qb4w_gemm_minmax_ukernel_8x4c16s2__neoni8mm_mstep2(benchmark::State& state, const char* net) { GEMMBenchmark(state, xnn_qp8_f32_qb4w_gemm_minmax_ukernel_8x4c16s2__neoni8mm_mstep2, xnn_init_f32_qb4w_minmax_scalar_params, @@ -45,7 +45,7 @@ benchmark::utils::CheckNEONI8MM); } - BENCHMARK_GEMM_BL(qp8_f32_qb4w_gemm_minmax_ukernel_8x4c16s2__aarch64_neoni8mm_mstep2) + BENCHMARK_GEMM_BL(qp8_f32_qb4w_gemm_minmax_ukernel_8x4c16s2__neoni8mm_mstep2) #endif // XNN_ENABLE_KLEIDIAI #endif // XNN_ENABLE_ARM_I8MM && XNN_ARCH_ARM64 diff --git a/src/xnnpack/reduce.h b/src/xnnpack/reduce.h index 956453161a5..fded712f297 100644 --- a/src/xnnpack/reduce.h +++ b/src/xnnpack/reduce.h @@ -413,27 +413,27 @@ DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_ukernel__ssse3_u64_acc4) uint32_t* output, \ const struct xnn_qs8_rsum_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]); +DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u32) +DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u64) +DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u64_acc2) +DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u128) +DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u128_acc2) +DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u128_acc4) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__neon_u16) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__neon_u32) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__neon_u32_acc2) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__neon_u64) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__neon_u64_acc2) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__neon_u64_acc4) +DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__scalar_u1) +DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__scalar_u2) +DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__scalar_u4) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__sse2_u16) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__sse2_u32) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__sse2_u32_acc2) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__sse2_u64) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__sse2_u64_acc2) DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__sse2_u64_acc4) -DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u32) -DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u64) -DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u64_acc2) -DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u128) -DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u128_acc2) -DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__avx2_u128_acc4) -DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__scalar_u1) -DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__scalar_u2) -DECLARE_QU8_RSUM_UKERNEL_FUNCTION(xnn_qu8_rsum_ukernel__scalar_u4) #define DECLARE_F32_RDSUM_UKERNEL_FUNCTION(fn_name) \ XNN_INTERNAL void fn_name( \ diff --git a/test/qp8-f32-qb4w-gemm-minmax.cc b/test/qp8-f32-qb4w-gemm-minmax.cc index 033592c3d95..62bc212789f 100644 --- a/test/qp8-f32-qb4w-gemm-minmax.cc +++ b/test/qp8-f32-qb4w-gemm-minmax.cc @@ -158,7 +158,7 @@ std::vector CreateTests1( #if XNN_ENABLE_ARM_I8MM && XNN_ARCH_ARM64 #if XNN_ENABLE_KLEIDIAI INSTANTIATE_TEST_SUITE_P( - QP8_F32_QB4W_GEMM_MINMAX_4X8C16S2__AARCH64_NEONI8MM, GemmTest, + QP8_F32_QB4W_GEMM_MINMAX_4X8C16S2__NEONI8MM, GemmTest, testing::ValuesIn(CreateTests1( /*k_block=*/32, /*adj_k_block=*/32, @@ -180,7 +180,7 @@ std::vector CreateTests1( INSTANTIATE_TEST_SUITE_P( - QP8_F32_QB4W_GEMM_MINMAX_8X4C16S2__AARCH64_NEONI8MM_MSTEP2, GemmTest, + QP8_F32_QB4W_GEMM_MINMAX_8X4C16S2__NEONI8MM_MSTEP2, GemmTest, testing::ValuesIn(CreateTests1( /*k_block=*/32, /*adj_k_block=*/32,