diff --git a/CMakeLists.txt b/CMakeLists.txt index c94020d7e74..1877c8dec45 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -430,7 +430,6 @@ SET(OPERATOR_SRCS src/operators/average-pooling-nhwc.c src/operators/batch-matrix-multiply-nc.c src/operators/binary-elementwise-nd.c - src/operators/channel-shuffle-nc.c src/operators/constant-pad-nd.c src/operators/convolution-nchw.c src/operators/convolution-nhwc.c @@ -523,7 +522,6 @@ SET(XNNPACK_SRCS src/configs/xx-fill-config.c src/configs/xx-pad-config.c src/configs/x8-lut-config.c - src/configs/zip-config.c src/init.c src/params.c "${PROJECT_BINARY_DIR}/build_identifier.c") @@ -1472,12 +1470,10 @@ IF(XNNPACK_BUILD_TESTS) x32-packw x32-packx x32-unpool - x32-zip x8-lut x8-packw qs8-packw qs8-qc4w-packw - x8-zip xN-transpose xx-fill xx-pad) @@ -1875,7 +1871,6 @@ IF(XNNPACK_BUILD_BENCHMARKS) # ---[ Build operator-level microbenchmarks SET(LIBRARY_OPERATOR_BENCHMARKS average-pooling - channel-shuffle convolution deconvolution max-pooling diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel index 16eacfc836b..93380b3a908 100644 --- a/bench/BUILD.bazel +++ b/bench/BUILD.bazel @@ -608,12 +608,6 @@ xnnpack_benchmark( ], ) -xnnpack_benchmark( - name = "channel_shuffle_bench", - srcs = ["channel-shuffle.cc"], - deps = OPERATOR_BENCHMARK_DEPS, -) - xnnpack_benchmark( name = "convolution_bench", srcs = ["convolution.cc"], diff --git a/bench/channel-shuffle.cc b/bench/channel-shuffle.cc deleted file mode 100644 index 0a9f820ba3a..00000000000 --- a/bench/channel-shuffle.cc +++ /dev/null @@ -1,340 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include -#include -#include - -#include "xnnpack.h" - -#include -#include "utils.h" -#include "xnnpack/buffer.h" - - -static void channel_shuffle_x8(benchmark::State& state, const char* net) { - const size_t batch_size = static_cast(state.range(0)); - const size_t groups = static_cast(state.range(1)); - const size_t group_channels = static_cast(state.range(2)); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(uint8_t) + batch_size * groups * group_channels); - xnnpack::Buffer output(batch_size * groups * group_channels); - xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng); - - xnn_status status = xnn_initialize(nullptr /* allocator */); - if (status != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - xnn_operator_t channel_shuffle_op = nullptr; - status = xnn_create_channel_shuffle_nc_x8( - groups, group_channels, - groups * group_channels /* input stride */, - groups * group_channels /* output stride */, - 0 /* flags */, &channel_shuffle_op); - if (status != xnn_status_success || channel_shuffle_op == nullptr) { - state.SkipWithError("failed to create X8 Channel Shuffle operator"); - return; - } - - status = xnn_reshape_channel_shuffle_nc_x8( - channel_shuffle_op, - batch_size, - /*threadpool=*/nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to reshape X8 Channel Shuffle operator"); - return; - } - - status = xnn_setup_channel_shuffle_nc_x8( - channel_shuffle_op, - input.data(), output.data()); - if (status != xnn_status_success) { - state.SkipWithError("failed to setup X8 Channel Shuffle operator"); - return; - } - - for (auto _ : state) { - status = xnn_run_operator(channel_shuffle_op, /*threadpool=*/nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run X8 Channel Shuffle operator"); - return; - } - } - - status = xnn_delete_operator(channel_shuffle_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to delete X8 Channel Shuffle operator"); - return; - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t elements_per_iteration = batch_size * groups * group_channels; - state.counters["elements"] = - benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(uint8_t); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -static void channel_shuffle_x32(benchmark::State& state, const char* net) { - const size_t batch_size = static_cast(state.range(0)); - const size_t groups = static_cast(state.range(1)); - const size_t group_channels = static_cast(state.range(2)); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(), std::ref(rng)); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(float) + batch_size * groups * group_channels); - xnnpack::Buffer output(batch_size * groups * group_channels); - std::generate(input.begin(), input.end(), std::ref(f32rng)); - - xnn_status status = xnn_initialize(nullptr /* allocator */); - if (status != xnn_status_success) { - state.SkipWithError("failed to initialize XNNPACK"); - return; - } - - xnn_operator_t channel_shuffle_op = nullptr; - status = xnn_create_channel_shuffle_nc_x32( - groups, group_channels, - groups * group_channels /* input stride */, - groups * group_channels /* output stride */, - 0 /* flags */, &channel_shuffle_op); - if (status != xnn_status_success || channel_shuffle_op == nullptr) { - state.SkipWithError("failed to create X32 Channel Shuffle operator"); - return; - } - - status = xnn_reshape_channel_shuffle_nc_x32( - channel_shuffle_op, - batch_size, - /*threadpool=*/nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to reshape X32 Channel Shuffle operator"); - return; - } - - status = xnn_setup_channel_shuffle_nc_x32( - channel_shuffle_op, - input.data(), output.data()); - if (status != xnn_status_success) { - state.SkipWithError("failed to setup X32 Channel Shuffle operator"); - return; - } - - for (auto _ : state) { - status = xnn_run_operator(channel_shuffle_op, /*threadpool=*/nullptr); - if (status != xnn_status_success) { - state.SkipWithError("failed to run X32 Channel Shuffle operator"); - return; - } - } - - status = xnn_delete_operator(channel_shuffle_op); - if (status != xnn_status_success) { - state.SkipWithError("failed to delete X32 Channel Shuffle operator"); - return; - } - - const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency(); - if (cpu_frequency != 0) { - state.counters["cpufreq"] = cpu_frequency; - } - - const size_t elements_per_iteration = batch_size * groups * group_channels; - state.counters["elements"] = - benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate); - - const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(float); - state.counters["bytes"] = - benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate); -} - -static void ShuffleNetV1G2Arguments(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"N", "G", "GC"}); - - /******** Stage 2 ********/ - /* H W G CG */ - b->Args({56 * 56, 2, 25}); - b->Args({28 * 28, 2, 25}); - - /******** Stage 3 ********/ - /* H W G CG */ - b->Args({28 * 28, 2, 50}); - b->Args({14 * 14, 2, 50}); - - /******** Stage 4 ********/ - /* H W G CG */ - b->Args({14 * 14, 2, 100}); - b->Args({ 7 * 7, 2, 100}); -} - -static void ShuffleNetV1G3Arguments(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"N", "G", "GC"}); - - /******** Stage 2 *******/ - /* H W G CG */ - b->Args({56 * 56, 3, 20}); - b->Args({28 * 28, 3, 20}); - - /******** Stage 3 *******/ - /* H W G CG */ - b->Args({28 * 28, 3, 40}); - b->Args({14 * 14, 3, 40}); - - /******** Stage 4 *******/ - /* H W G CG */ - b->Args({14 * 14, 3, 80}); - b->Args({ 7 * 7, 3, 80}); -} - -static void ShuffleNetV1G4Arguments(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"N", "G", "GC"}); - - /******** Stage 2 *******/ - /* H W G CG */ - b->Args({56 * 56, 4, 17}); - b->Args({28 * 28, 4, 17}); - - /******** Stage 3 *******/ - /* H W G CG */ - b->Args({28 * 28, 4, 34}); - b->Args({14 * 14, 4, 34}); - - /******** Stage 4 *******/ - /* H W G CG */ - b->Args({14 * 14, 4, 68}); - b->Args({ 7 * 7, 4, 68}); -} - -static void ShuffleNetV1G8Arguments(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"N", "G", "GC"}); - - /******** Stage 2 *******/ - /* H W G CG */ - b->Args({56 * 56, 8, 12}); - b->Args({28 * 28, 8, 12}); - - /******** Stage 3 *******/ - /* H W G CG */ - b->Args({28 * 28, 8, 24}); - b->Args({14 * 14, 8, 24}); - - /******** Stage 4 *******/ - /* H W G CG */ - b->Args({14 * 14, 8, 48}); - b->Args({ 7 * 7, 8, 48}); -} - -static void ShuffleNetV2x0_5Arguments(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"N", "G", "GC"}); - - /******** Stage 2 *******/ - /* H W G CG */ - b->Args({28 * 28, 2, 24}); - - /******** Stage 3 *******/ - /* H W G CG */ - b->Args({14 * 14, 2, 48}); - - /******** Stage 4 *******/ - /* H W G CG */ - b->Args({ 7 * 7, 2, 96}); -} - -static void ShuffleNetV2x1_0Arguments(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"N", "G", "GC"}); - - /******** Stage 2 ********/ - /* H W G CG */ - b->Args({28 * 28, 2, 58}); - - /******** Stage 3 ********/ - /* H W G CG */ - b->Args({14 * 14, 2, 116}); - - /******** Stage 4 ********/ - /* H W G CG */ - b->Args({ 7 * 7, 2, 232}); -} - -static void ShuffleNetV2x1_5Arguments(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"N", "G", "GC"}); - - /******** Stage 2 ********/ - /* H W G CG */ - b->Args({28 * 28, 2, 88}); - - /******** Stage 3 ********/ - /* H W G CG */ - b->Args({14 * 14, 2, 176}); - - /******** Stage 4 ********/ - /* H W G CG */ - b->Args({ 7 * 7, 2, 352}); -} - -static void ShuffleNetV2x2_0Arguments(benchmark::internal::Benchmark* b) -{ - b->ArgNames({"N", "G", "GC"}); - - /******** Stage 2 ********/ - /* H W G CG */ - b->Args({28 * 28, 2, 122}); - - /******** Stage 3 ********/ - /* H W G CG */ - b->Args({14 * 14, 2, 244}); - - /******** Stage 4 ********/ - /* H W G CG */ - b->Args({ 7 * 7, 2, 488}); -} - -BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime(); - -BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime(); -BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime(); - -#ifndef XNNPACK_BENCHMARK_NO_MAIN -BENCHMARK_MAIN(); -#endif diff --git a/build_srcs.bzl b/build_srcs.bzl index bd75d4b55b4..e4c38f759c9 100644 --- a/build_srcs.bzl +++ b/build_srcs.bzl @@ -13,7 +13,6 @@ OPERATOR_SRCS = [ "src/operators/average-pooling-nhwc.c", "src/operators/batch-matrix-multiply-nc.c", "src/operators/binary-elementwise-nd.c", - "src/operators/channel-shuffle-nc.c", "src/operators/constant-pad-nd.c", "src/operators/convolution-nchw.c", "src/operators/convolution-nhwc.c", @@ -108,7 +107,6 @@ XNNPACK_SRCS = [ "src/configs/x8-lut-config.c", "src/configs/xx-fill-config.c", "src/configs/xx-pad-config.c", - "src/configs/zip-config.c", ] LOGGING_SRCS = [ diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake index ec7b7a3ade6..6e714381fcb 100644 --- a/cmake/gen/neon_microkernels.cmake +++ b/cmake/gen/neon_microkernels.cmake @@ -151,10 +151,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/u8-rmax/u8-rmax-neon-u16.c src/u8-vclamp/u8-vclamp-neon-u64.c src/x8-transposec/gen/x8-transposec-16x16-reuse-dec-zip-neon.c - src/x8-zip/x8-zip-x2-neon.c - src/x8-zip/x8-zip-x3-neon.c - src/x8-zip/x8-zip-x4-neon.c - src/x8-zip/x8-zip-xm-neon.c src/x16-packw/gen/x16-packw-x8-gemm-goi-neon-ld4lane-u8-prfm.c src/x16-packw/gen/x16-packw-x16-gemm-goi-neon-ld4lane-u8-prfm.c src/x16-transposec/gen/x16-transposec-8x8-reuse-dec-zip-neon.c @@ -164,10 +160,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS src/x32-packw/gen/x32-packw-x8s4-gemm-goi-neon-ld4lane-u4-prfm.c src/x32-transposec/gen/x32-transposec-4x4-reuse-dec-zip-neon.c src/x32-unpool/x32-unpool-neon.c - src/x32-zip/x32-zip-x2-neon.c - src/x32-zip/x32-zip-x3-neon.c - src/x32-zip/x32-zip-x4-neon.c - src/x32-zip/x32-zip-xm-neon.c src/x64-transposec/gen/x64-transposec-2x2-multi-dec-zip-neon.c src/x64-transposec/gen/x64-transposec-2x2-reuse-dec-zip-neon.c src/xx-fill/xx-fill-neon-u64.c diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake index 89b5b0fdc0c..a394b9f01ed 100644 --- a/cmake/gen/scalar_microkernels.cmake +++ b/cmake/gen/scalar_microkernels.cmake @@ -231,10 +231,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/x8-packw/gen/x8-packw-x16-gemm-goi-scalar-u2.c src/x8-packw/gen/x8-packw-x32-gemm-goi-scalar-u2.c src/x8-transposec/gen/x8-transposec-2x4-scalar-int.c - src/x8-zip/x8-zip-x2-scalar.c - src/x8-zip/x8-zip-x3-scalar.c - src/x8-zip/x8-zip-x4-scalar.c - src/x8-zip/x8-zip-xm-scalar.c src/x16-packw/gen/x16-packw-x64-gemm-goi-scalar-int-u4.c src/x16-transposec/gen/x16-transposec-2x4-scalar-int.c src/x24-transposec/gen/x24-transposec-1x2-scalar.c @@ -242,10 +238,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS src/x32-packw/gen/x32-packw-x4-gemm-goi-scalar-float-u4.c src/x32-transposec/gen/x32-transposec-2x4-scalar-int.c src/x32-unpool/x32-unpool-scalar.c - src/x32-zip/x32-zip-x2-scalar.c - src/x32-zip/x32-zip-x3-scalar.c - src/x32-zip/x32-zip-x4-scalar.c - src/x32-zip/x32-zip-xm-scalar.c src/x64-transposec/gen/x64-transposec-4x2-scalar-int.c src/xx-copy/xx-copy-scalar-memcpy.c src/xx-fill/xx-fill-scalar-u16.c diff --git a/cmake/gen/sse2_microkernels.cmake b/cmake/gen/sse2_microkernels.cmake index a6d675da34a..0615ceeec0c 100644 --- a/cmake/gen/sse2_microkernels.cmake +++ b/cmake/gen/sse2_microkernels.cmake @@ -86,18 +86,10 @@ SET(PROD_SSE2_MICROKERNEL_SRCS src/u8-rmax/u8-rmax-sse2-u16.c src/u8-vclamp/u8-vclamp-sse2-u64.c src/x8-transposec/gen/x8-transposec-16x16-reuse-mov-sse2.c - src/x8-zip/x8-zip-x2-sse2.c - src/x8-zip/x8-zip-x3-sse2.c - src/x8-zip/x8-zip-x4-sse2.c - src/x8-zip/x8-zip-xm-sse2.c src/x16-transposec/gen/x16-transposec-8x8-reuse-multi-sse2.c src/x32-packw/gen/x32-packw-x2c4-gemm-goi-sse2-u4.c src/x32-packw/gen/x32-packw-x8-gemm-goi-sse2-u4.c src/x32-unpool/x32-unpool-sse2.c - src/x32-zip/x32-zip-x2-sse2.c - src/x32-zip/x32-zip-x3-sse2.c - src/x32-zip/x32-zip-x4-sse2.c - src/x32-zip/x32-zip-xm-sse2.c src/x64-transposec/gen/x64-transposec-2x2-multi-mov-sse2.c src/xx-fill/xx-fill-sse2-u64.c src/xx-pad/xx-pad-p16-sse2-u16.c) diff --git a/cmake/gen/wasmsimd_microkernels.cmake b/cmake/gen/wasmsimd_microkernels.cmake index 19100e1bbb4..2d93b7d9717 100644 --- a/cmake/gen/wasmsimd_microkernels.cmake +++ b/cmake/gen/wasmsimd_microkernels.cmake @@ -216,10 +216,6 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS src/x32-packw/gen/x32-packw-x8-gemm-goi-wasmsimd-u4.c src/x32-transposec/gen/x32-transposec-4x4-reuse-mov-wasmsimd.c src/x32-unpool/x32-unpool-wasmsimd.c - src/x32-zip/x32-zip-x2-wasmsimd.c - src/x32-zip/x32-zip-x3-wasmsimd.c - src/x32-zip/x32-zip-x4-wasmsimd.c - src/x32-zip/x32-zip-xm-wasmsimd.c src/xx-fill/xx-fill-wasmsimd-u64.c src/xx-pad/xx-pad-p16-wasmsimd-u16.c) diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl index ac25b28f04c..9255594ed8e 100644 --- a/gen/neon_microkernels.bzl +++ b/gen/neon_microkernels.bzl @@ -147,10 +147,6 @@ PROD_NEON_MICROKERNEL_SRCS = [ "src/u8-rmax/u8-rmax-neon-u16.c", "src/u8-vclamp/u8-vclamp-neon-u64.c", "src/x8-transposec/gen/x8-transposec-16x16-reuse-dec-zip-neon.c", - "src/x8-zip/x8-zip-x2-neon.c", - "src/x8-zip/x8-zip-x3-neon.c", - "src/x8-zip/x8-zip-x4-neon.c", - "src/x8-zip/x8-zip-xm-neon.c", "src/x16-packw/gen/x16-packw-x8-gemm-goi-neon-ld4lane-u8-prfm.c", "src/x16-packw/gen/x16-packw-x16-gemm-goi-neon-ld4lane-u8-prfm.c", "src/x16-transposec/gen/x16-transposec-8x8-reuse-dec-zip-neon.c", @@ -160,10 +156,6 @@ PROD_NEON_MICROKERNEL_SRCS = [ "src/x32-packw/gen/x32-packw-x8s4-gemm-goi-neon-ld4lane-u4-prfm.c", "src/x32-transposec/gen/x32-transposec-4x4-reuse-dec-zip-neon.c", "src/x32-unpool/x32-unpool-neon.c", - "src/x32-zip/x32-zip-x2-neon.c", - "src/x32-zip/x32-zip-x3-neon.c", - "src/x32-zip/x32-zip-x4-neon.c", - "src/x32-zip/x32-zip-xm-neon.c", "src/x64-transposec/gen/x64-transposec-2x2-multi-dec-zip-neon.c", "src/x64-transposec/gen/x64-transposec-2x2-reuse-dec-zip-neon.c", "src/xx-fill/xx-fill-neon-u64.c", diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl index c8cce31c401..2b65594d1c4 100644 --- a/gen/scalar_microkernels.bzl +++ b/gen/scalar_microkernels.bzl @@ -227,10 +227,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/x8-packw/gen/x8-packw-x16-gemm-goi-scalar-u2.c", "src/x8-packw/gen/x8-packw-x32-gemm-goi-scalar-u2.c", "src/x8-transposec/gen/x8-transposec-2x4-scalar-int.c", - "src/x8-zip/x8-zip-x2-scalar.c", - "src/x8-zip/x8-zip-x3-scalar.c", - "src/x8-zip/x8-zip-x4-scalar.c", - "src/x8-zip/x8-zip-xm-scalar.c", "src/x16-packw/gen/x16-packw-x64-gemm-goi-scalar-int-u4.c", "src/x16-transposec/gen/x16-transposec-2x4-scalar-int.c", "src/x24-transposec/gen/x24-transposec-1x2-scalar.c", @@ -238,10 +234,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [ "src/x32-packw/gen/x32-packw-x4-gemm-goi-scalar-float-u4.c", "src/x32-transposec/gen/x32-transposec-2x4-scalar-int.c", "src/x32-unpool/x32-unpool-scalar.c", - "src/x32-zip/x32-zip-x2-scalar.c", - "src/x32-zip/x32-zip-x3-scalar.c", - "src/x32-zip/x32-zip-x4-scalar.c", - "src/x32-zip/x32-zip-xm-scalar.c", "src/x64-transposec/gen/x64-transposec-4x2-scalar-int.c", "src/xx-copy/xx-copy-scalar-memcpy.c", "src/xx-fill/xx-fill-scalar-u16.c", diff --git a/gen/sse2_microkernels.bzl b/gen/sse2_microkernels.bzl index 8375c7af656..f5d1f36709b 100644 --- a/gen/sse2_microkernels.bzl +++ b/gen/sse2_microkernels.bzl @@ -82,18 +82,10 @@ PROD_SSE2_MICROKERNEL_SRCS = [ "src/u8-rmax/u8-rmax-sse2-u16.c", "src/u8-vclamp/u8-vclamp-sse2-u64.c", "src/x8-transposec/gen/x8-transposec-16x16-reuse-mov-sse2.c", - "src/x8-zip/x8-zip-x2-sse2.c", - "src/x8-zip/x8-zip-x3-sse2.c", - "src/x8-zip/x8-zip-x4-sse2.c", - "src/x8-zip/x8-zip-xm-sse2.c", "src/x16-transposec/gen/x16-transposec-8x8-reuse-multi-sse2.c", "src/x32-packw/gen/x32-packw-x2c4-gemm-goi-sse2-u4.c", "src/x32-packw/gen/x32-packw-x8-gemm-goi-sse2-u4.c", "src/x32-unpool/x32-unpool-sse2.c", - "src/x32-zip/x32-zip-x2-sse2.c", - "src/x32-zip/x32-zip-x3-sse2.c", - "src/x32-zip/x32-zip-x4-sse2.c", - "src/x32-zip/x32-zip-xm-sse2.c", "src/x64-transposec/gen/x64-transposec-2x2-multi-mov-sse2.c", "src/xx-fill/xx-fill-sse2-u64.c", "src/xx-pad/xx-pad-p16-sse2-u16.c", diff --git a/gen/wasmsimd_microkernels.bzl b/gen/wasmsimd_microkernels.bzl index a7fefd4e202..2925413d9d6 100644 --- a/gen/wasmsimd_microkernels.bzl +++ b/gen/wasmsimd_microkernels.bzl @@ -212,10 +212,6 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [ "src/x32-packw/gen/x32-packw-x8-gemm-goi-wasmsimd-u4.c", "src/x32-transposec/gen/x32-transposec-4x4-reuse-mov-wasmsimd.c", "src/x32-unpool/x32-unpool-wasmsimd.c", - "src/x32-zip/x32-zip-x2-wasmsimd.c", - "src/x32-zip/x32-zip-x3-wasmsimd.c", - "src/x32-zip/x32-zip-x4-wasmsimd.c", - "src/x32-zip/x32-zip-xm-wasmsimd.c", "src/xx-fill/xx-fill-wasmsimd-u64.c", "src/xx-pad/xx-pad-p16-wasmsimd-u16.c", ] diff --git a/include/xnnpack.h b/include/xnnpack.h index ad38d79a34a..47ebbfc474e 100644 --- a/include/xnnpack.h +++ b/include/xnnpack.h @@ -2680,42 +2680,6 @@ enum xnn_status xnn_setup_batch_matrix_multiply_nc_qd8_f32_qc8w( const struct xnn_quantization_params* quantization_params, float* output); -enum xnn_status xnn_create_channel_shuffle_nc_x8( - size_t groups, - size_t group_channels, - size_t input_stride, - size_t output_stride, - uint32_t flags, - xnn_operator_t* channel_shuffle_op_out); - -enum xnn_status xnn_reshape_channel_shuffle_nc_x8( - xnn_operator_t channel_shuffle_op, - size_t batch_size, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_channel_shuffle_nc_x8( - xnn_operator_t channel_shuffle_op, - const void* input, - void* output); - -enum xnn_status xnn_create_channel_shuffle_nc_x32( - size_t groups, - size_t group_channels, - size_t input_stride, - size_t output_stride, - uint32_t flags, - xnn_operator_t* channel_shuffle_op_out); - -enum xnn_status xnn_reshape_channel_shuffle_nc_x32( - xnn_operator_t channel_shuffle_op, - size_t batch_size, - pthreadpool_t threadpool); - -enum xnn_status xnn_setup_channel_shuffle_nc_x32( - xnn_operator_t channel_shuffle_op, - const void* input, - void* output); - enum xnn_status xnn_create_constant_pad_nd_x8( const void* padding_value, uint32_t flags, diff --git a/src/configs/zip-config.c b/src/configs/zip-config.c deleted file mode 100644 index 4a5d3044e6a..00000000000 --- a/src/configs/zip-config.c +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2023 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include - -#include "xnnpack/common.h" -#include "xnnpack/config.h" -#include "xnnpack/init-once.h" -#include "xnnpack/microfnptr.h" -#include "xnnpack/zip.h" - -static struct xnn_zip_config x8_zip_config = {0}; -static struct xnn_zip_config x32_zip_config = {0}; - -XNN_INIT_ONCE_GUARD(x8_zip); -XNN_INIT_ONCE_GUARD(x32_zip); - -static void init_x8_zip_config(void) { - #if XNN_ARCH_ARM - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_arm_neon) { - x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__neon; - x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__neon; - x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__neon; - x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__neon; - } else if (!XNN_PLATFORM_MOBILE) { - x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar; - x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar; - x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar; - x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar; - } - #elif XNN_ARCH_ARM64 - x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__neon; - x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__neon; - x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__neon; - x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__neon; - #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 - x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__sse2; - x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__sse2; - x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__sse2; - x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__sse2; - #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar; - x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar; - x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar; - x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar; - #else - x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar; - x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar; - x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar; - x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar; - #endif - -} - -static void init_x32_zip_config(void) { - #if XNN_ARCH_ARM - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - assert(hardware_config != NULL); - if (hardware_config->use_arm_neon) { - x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__neon; - x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__neon; - x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__neon; - x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__neon; - } else if (!XNN_PLATFORM_MOBILE) { - x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__scalar; - x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__scalar; - x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__scalar; - x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__scalar; - } - #elif XNN_ARCH_ARM64 - x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__neon; - x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__neon; - x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__neon; - x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__neon; - #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 - x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__sse2; - x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__sse2; - x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__sse2; - x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__sse2; - #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD - x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__wasmsimd; - x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__wasmsimd; - x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__wasmsimd; - x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__wasmsimd; - #else - x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__scalar; - x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__scalar; - x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__scalar; - x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__scalar; - #endif - -} - -const struct xnn_zip_config* xnn_init_x8_zip_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL) { - return NULL; - } - XNN_INIT_ONCE(x8_zip); - return &x8_zip_config; -} - -const struct xnn_zip_config* xnn_init_x32_zip_config() { - const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); - if (hardware_config == NULL) { - return NULL; - } - XNN_INIT_ONCE(x32_zip); - return &x32_zip_config; -} diff --git a/src/operator-run.c b/src/operator-run.c index 4dc666279f4..30616f8576e 100644 --- a/src/operator-run.c +++ b/src/operator-run.c @@ -1958,26 +1958,6 @@ void xnn_compute_elementwise_binary_5d( context->ukernel(context->elements, a, b, y, &context->params); } -void xnn_compute_channel_shuffle_fixed( - const struct channel_shuffle_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t index) -{ - const void* x = (const void*) ((uintptr_t) context->x + index * context->x_stride); - void* y = (void*) ((uintptr_t) context->y + index * context->y_stride); - - context->fixed_ukernel(context->n, x, y); -} - -void xnn_compute_channel_shuffle_variable( - const struct channel_shuffle_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t index) -{ - const void* x = (const void*) ((uintptr_t) context->x + index * context->x_stride); - void* y = (void*) ((uintptr_t) context->y + index * context->y_stride); - - context->variable_ukernel(context->n, context->m, x, y); -} - void xnn_compute_lut_strided( const struct lut_strided_context context[restrict XNN_MIN_ELEMENTS(1)], size_t batch_index) diff --git a/src/operators/channel-shuffle-nc.c b/src/operators/channel-shuffle-nc.c deleted file mode 100644 index 78ab65d8d70..00000000000 --- a/src/operators/channel-shuffle-nc.c +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include -#include -#include -#include - -#include "xnnpack.h" -#include "xnnpack/allocator.h" -#include "xnnpack/common.h" -#include "xnnpack/compute.h" -#include "xnnpack/config-types.h" -#include "xnnpack/config.h" -#include "xnnpack/log.h" -#include "xnnpack/operator-type.h" -#include "xnnpack/operator.h" -#include "xnnpack/params.h" -#include "pthreadpool.h" - -static enum xnn_status create_channel_shuffle_nc( - size_t groups, - size_t group_channels, - size_t input_stride, - size_t output_stride, - uint32_t flags, - const struct xnn_zip_config* zip_config, - enum xnn_operator_type operator_type, - xnn_operator_t* channel_shuffle_op_out) -{ - xnn_operator_t channel_shuffle_op = NULL; - enum xnn_status status = xnn_status_uninitialized; - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to create %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(operator_type)); - goto error; - } - - status = xnn_status_invalid_parameter; - - if (groups <= 1) { - xnn_log_error( - "failed to create %s operator with %zu groups: at least two groups required", - xnn_operator_type_to_string(operator_type), groups); - goto error; - } - - if (group_channels == 0) { - xnn_log_error( - "failed to create %s operator with %zu group channels: number of group channels must be non-zero", - xnn_operator_type_to_string(operator_type), group_channels); - goto error; - } - - const size_t channels = groups * group_channels; - if (input_stride < channels) { - xnn_log_error( - "failed to create %s operator with input element stride of %zu: " - "stride must be at least as large as the number of channels (%zux%zu)", - xnn_operator_type_to_string(operator_type), input_stride, groups, group_channels); - goto error; - } - - if (output_stride < channels) { - xnn_log_error( - "failed to create %s operator with output element stride of %zu: " - "stride must be at least as large as the number of channels (%zux%zu)", - xnn_operator_type_to_string(operator_type), output_stride, groups, group_channels); - goto error; - } - - status = xnn_status_out_of_memory; - - channel_shuffle_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator)); - if (channel_shuffle_op == NULL) { - xnn_log_error( - "failed to allocate %zu bytes for %s operator descriptor", - sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type)); - goto error; - } - - channel_shuffle_op->groups = groups; - channel_shuffle_op->group_channels = group_channels; - channel_shuffle_op->input_pixel_stride = input_stride; - channel_shuffle_op->output_pixel_stride = output_stride; - - channel_shuffle_op->type = operator_type; - channel_shuffle_op->flags = flags; - channel_shuffle_op->zip_config = zip_config; - - channel_shuffle_op->state = xnn_run_state_invalid; - - *channel_shuffle_op_out = channel_shuffle_op; - return xnn_status_success; - -error: - xnn_delete_operator(channel_shuffle_op); - return status; -} - - -enum xnn_status xnn_create_channel_shuffle_nc_x8( - size_t groups, - size_t group_channels, - size_t input_stride, - size_t output_stride, - uint32_t flags, - xnn_operator_t* channel_shuffle_op_out) -{ - const struct xnn_zip_config* zip_config = xnn_init_x8_zip_config(); - assert(zip_config != NULL); - return create_channel_shuffle_nc( - groups, - group_channels, - input_stride, - output_stride, - flags, - zip_config, - xnn_operator_type_channel_shuffle_nc_x8, - channel_shuffle_op_out); -} - -enum xnn_status xnn_create_channel_shuffle_nc_x32( - size_t groups, - size_t group_channels, - size_t input_stride, - size_t output_stride, - uint32_t flags, - xnn_operator_t* channel_shuffle_op_out) -{ - const struct xnn_zip_config* zip_config = xnn_init_x32_zip_config(); - if (zip_config == NULL) { - xnn_log_error( - "failed to create %s operator: unsupported hardware configuration", - xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x32)); - return xnn_status_unsupported_hardware; - } - return create_channel_shuffle_nc( - groups, - group_channels, - input_stride, - output_stride, - flags, - zip_config, - xnn_operator_type_channel_shuffle_nc_x32, - channel_shuffle_op_out); -} - -static enum xnn_status reshape_channel_shuffle_nc( - xnn_operator_t channel_shuffle_op, - size_t batch_size, - uint32_t log2_element_size, - const struct xnn_zip_config zip[restrict XNN_MIN_ELEMENTS(1)]) -{ - channel_shuffle_op->state = xnn_run_state_invalid; - - if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) { - xnn_log_error("failed to reshape %s operator: XNNPACK is not initialized", - xnn_operator_type_to_string(channel_shuffle_op->type)); - return xnn_status_uninitialized; - } - - if (batch_size == 0) { - channel_shuffle_op->state = xnn_run_state_skip; - return xnn_status_success; - } - - channel_shuffle_op->batch_size = batch_size; - - const size_t groups = channel_shuffle_op->groups; - channel_shuffle_op->context.channel_shuffle = (struct channel_shuffle_context) { - .x_stride = channel_shuffle_op->input_pixel_stride << log2_element_size, - .y_stride = channel_shuffle_op->output_pixel_stride << log2_element_size, - .n = channel_shuffle_op->group_channels << log2_element_size, - .m = groups, - }; - channel_shuffle_op->compute[0].type = xnn_parallelization_type_1d; - channel_shuffle_op->compute[0].range[0] = batch_size; - switch (groups) { - case 2: - channel_shuffle_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_channel_shuffle_fixed; - channel_shuffle_op->context.channel_shuffle.fixed_ukernel = zip->x2; - break; - case 3: - channel_shuffle_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_channel_shuffle_fixed; - channel_shuffle_op->context.channel_shuffle.fixed_ukernel = zip->x3; - break; - case 4: - channel_shuffle_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_channel_shuffle_fixed; - channel_shuffle_op->context.channel_shuffle.fixed_ukernel = zip->x4; - break; - default: - channel_shuffle_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_channel_shuffle_variable; - channel_shuffle_op->context.channel_shuffle.variable_ukernel = zip->xm; - break; - case 0: - case 1: - XNN_UNREACHABLE; - } - channel_shuffle_op->state = xnn_run_state_needs_setup; - - return xnn_status_success; -} - -enum xnn_status xnn_reshape_channel_shuffle_nc_x8( - xnn_operator_t channel_shuffle_op, - size_t batch_size, - pthreadpool_t threadpool) -{ - if (channel_shuffle_op->type != xnn_operator_type_channel_shuffle_nc_x8) { - xnn_log_error("failed to reshape operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x8), - xnn_operator_type_to_string(channel_shuffle_op->type)); - return xnn_status_invalid_parameter; - } - - return reshape_channel_shuffle_nc( - channel_shuffle_op, - batch_size, - /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T, - channel_shuffle_op->zip_config); -} - -enum xnn_status xnn_reshape_channel_shuffle_nc_x32( - xnn_operator_t channel_shuffle_op, - size_t batch_size, - pthreadpool_t threadpool) -{ - if (channel_shuffle_op->type != xnn_operator_type_channel_shuffle_nc_x32) { - xnn_log_error("failed to reshape operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x32), - xnn_operator_type_to_string(channel_shuffle_op->type)); - return xnn_status_invalid_parameter; - } - - return reshape_channel_shuffle_nc( - channel_shuffle_op, - batch_size, - /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT32_T, - channel_shuffle_op->zip_config); -} - -static enum xnn_status setup_channel_shuffle_nc( - xnn_operator_t channel_shuffle_op, - const void* input, - void* output) -{ - switch (channel_shuffle_op->state) { - case xnn_run_state_skip: - return xnn_status_success; - case xnn_run_state_invalid: - xnn_log_error( - "failed to setup %s operator: operator has not been reshaped yet", - xnn_operator_type_to_string(channel_shuffle_op->type)); - return xnn_status_invalid_state; - case xnn_run_state_needs_setup: - // Operator has been reshaped, but not setup, continue with setup. - case xnn_run_state_ready: - // Operator has been reshaped, and we are setting up with different pointers. - break; - } - - channel_shuffle_op->context.channel_shuffle.x = input; - channel_shuffle_op->context.channel_shuffle.y = output; - - channel_shuffle_op->state = xnn_run_state_ready; - - return xnn_status_success; -} - -enum xnn_status xnn_setup_channel_shuffle_nc_x8( - xnn_operator_t channel_shuffle_op, - const void* input, - void* output) -{ - if (channel_shuffle_op->type != xnn_operator_type_channel_shuffle_nc_x8) { - xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x8), - xnn_operator_type_to_string(channel_shuffle_op->type)); - return xnn_status_invalid_parameter; - } - - return setup_channel_shuffle_nc( - channel_shuffle_op, - input, - output); -} - -enum xnn_status xnn_setup_channel_shuffle_nc_x32( - xnn_operator_t channel_shuffle_op, - const void* input, - void* output) -{ - if (channel_shuffle_op->type != xnn_operator_type_channel_shuffle_nc_x32) { - xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)", - xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x32), - xnn_operator_type_to_string(channel_shuffle_op->type)); - return xnn_status_invalid_parameter; - } - - return setup_channel_shuffle_nc( - channel_shuffle_op, - input, - output); -} diff --git a/src/x32-zip/x32-zip-x2-neon.c b/src/x32-zip/x32-zip-x2-neon.c deleted file mode 100644 index d56f32fa046..00000000000 --- a/src/x32-zip/x32-zip-x2-neon.c +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x2_ukernel__neon( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - - const uint32_t* x = input; - const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); - uint32_t* o = output; - - while (n >= 16) { - uint32x4x2_t vxy; - vxy.val[0] = vld1q_u32(x); x += 4; - vxy.val[1] = vld1q_u32(y); y += 4; - vst2q_u32(o, vxy); o += 8; - n -= 16; - } - if XNN_UNLIKELY(n != 0) { - if (n & 8) { - uint32x2x2_t vxy; - vxy.val[0] = vld1_u32(x); x += 2; - vxy.val[1] = vld1_u32(y); y += 2; - vst2_u32(o, vxy); o += 4; - } - if (n & 4) { - uint32x2_t vxy = vld1_dup_u32(x); - vxy = vld1_lane_u32(y, vxy, 1); - vst1_u32(o, vxy); - } - } -} diff --git a/src/x32-zip/x32-zip-x2-scalar.c b/src/x32-zip/x32-zip-x2-scalar.c deleted file mode 100644 index f6e3c86b1f4..00000000000 --- a/src/x32-zip/x32-zip-x2-scalar.c +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x2_ukernel__scalar( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - - const uint32_t* x = input; - const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); - - do { - const uint32_t vx = *x++; - const uint32_t vy = *y++; - output[0] = vx; - output[1] = vy; - output += 2; - - n -= 4; - } while (n != 0); -} diff --git a/src/x32-zip/x32-zip-x2-sse2.c b/src/x32-zip/x32-zip-x2-sse2.c deleted file mode 100644 index 548976c41ff..00000000000 --- a/src/x32-zip/x32-zip-x2-sse2.c +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x2_ukernel__sse2( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - - const uint32_t* x = input; - const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); - uint32_t* o = output; - - while (n >= 16) { - const __m128i vx = _mm_loadu_si128((const __m128i*) x); - x += 4; - const __m128i vy = _mm_loadu_si128((const __m128i*) y); - y += 4; - const __m128i vxy_lo = _mm_unpacklo_epi32(vx, vy); - const __m128i vxy_hi = _mm_unpackhi_epi32(vx, vy); - _mm_storeu_si128((__m128i*) o, vxy_lo); - _mm_storeu_si128((__m128i*) (o + 4), vxy_hi); - o += 8; - n -= 16; - } - if XNN_UNLIKELY(n != 0) { - if (n & 8) { - const __m128i vx = _mm_loadl_epi64((const __m128i*) x); - x += 2; - const __m128i vy = _mm_loadl_epi64((const __m128i*) y); - y += 2; - const __m128i vxy = _mm_unpacklo_epi32(vx, vy); - _mm_storeu_si128((__m128i*) o, vxy); - o += 4; - } - if (n & 4) { - const uint32_t vx = *x; - const uint32_t vy = *y; - o[0] = vx; - o[1] = vy; - } - } -} diff --git a/src/x32-zip/x32-zip-x2-wasmsimd.c b/src/x32-zip/x32-zip-x2-wasmsimd.c deleted file mode 100644 index f2478be361b..00000000000 --- a/src/x32-zip/x32-zip-x2-wasmsimd.c +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x2_ukernel__wasmsimd( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % sizeof(uint32_t) == 0); - - const float* x = (const float*) input; - const float* y = (const float*) ((uintptr_t) x + n); - float* o = (float*) output; - - while (n >= 4 * sizeof(uint32_t)) { - const v128_t vx = wasm_v128_load(x); - x += 4; - const v128_t vy = wasm_v128_load(y); - y += 4; - const v128_t vxy_lo = wasm_v32x4_shuffle(vx, vy, 0, 4, 1, 5); - const v128_t vxy_hi = wasm_v32x4_shuffle(vx, vy, 2, 6, 3, 7); - wasm_v128_store(o, vxy_lo); - wasm_v128_store(o + 4, vxy_hi); - o += 8; - n -= 4 * sizeof(uint32_t); - } - if XNN_UNLIKELY(n != 0) { - if (n & (2 * sizeof(uint32_t))) { - const double vx = *((const double*) x); - x += 2; - const double vy = *((const double*) y); - y += 2; - const v128_t vxy = wasm_f64x2_make(vx, vy); - wasm_v128_store(o, wasm_v32x4_shuffle(vxy, vxy, 0, 2, 1, 3)); - o += 4; - } - if (n & (1 * sizeof(uint32_t))) { - const float vx = *x; - const float vy = *y; - o[0] = vx; - o[1] = vy; - } - } -} diff --git a/src/x32-zip/x32-zip-x3-neon.c b/src/x32-zip/x32-zip-x3-neon.c deleted file mode 100644 index 8ca3baa1291..00000000000 --- a/src/x32-zip/x32-zip-x3-neon.c +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x3_ukernel__neon( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - - const uint32_t* x = input; - const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); - const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n); - uint32_t* o = output; - - while (n >= 16) { - uint32x4x3_t vxyz; - vxyz.val[0] = vld1q_u32(x); x += 4; - vxyz.val[1] = vld1q_u32(y); y += 4; - vxyz.val[2] = vld1q_u32(z); z += 4; - vst3q_u32(o, vxyz); o += 12; - n -= 16; - } - if XNN_UNLIKELY(n != 0) { - if (n & 8) { - uint32x2x3_t vxyz; - vxyz.val[0] = vld1_u32(x); x += 2; - vxyz.val[1] = vld1_u32(y); y += 2; - vxyz.val[2] = vld1_u32(z); z += 2; - vst3_u32(o, vxyz); o += 6; - } - if (n & 4) { - uint32x2_t vxy = vld1_dup_u32(x); - const uint32x2_t vz = vld1_dup_u32(z); - vxy = vld1_lane_u32(y, vxy, 1); - vst1_u32(o, vxy); o += 2; - vst1_lane_u32(o, vz, 0); - } - } -} diff --git a/src/x32-zip/x32-zip-x3-scalar.c b/src/x32-zip/x32-zip-x3-scalar.c deleted file mode 100644 index 9a7cc7a93d9..00000000000 --- a/src/x32-zip/x32-zip-x3-scalar.c +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x3_ukernel__scalar( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - - const uint32_t* x = input; - const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); - const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n); - uint32_t* o = output; - - do { - const uint32_t vx = *x++; - const uint32_t vy = *y++; - const uint32_t vz = *z++; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o += 3; - - n -= 4; - } while (n != 0); -} diff --git a/src/x32-zip/x32-zip-x3-sse2.c b/src/x32-zip/x32-zip-x3-sse2.c deleted file mode 100644 index bef22257473..00000000000 --- a/src/x32-zip/x32-zip-x3-sse2.c +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x3_ukernel__sse2( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - - const float* x = (const float*) input; - const float* y = (const float*) ((uintptr_t) x + n); - const float* z = (const float*) ((uintptr_t) y + n); - float* o = (float*) output; - - while (n >= 16) { - // vx = ( x3, x2, x1, x0 ) - const __m128 vx = _mm_loadu_ps(x); - x += 4; - // vy = ( y3, y2, y1, y0 ) - const __m128 vy = _mm_loadu_ps(y); - y += 4; - // vz = ( z3, z2, z1, z0 ) - const __m128 vz = _mm_loadu_ps(z); - z += 4; - - // vxy = ( y2, y0, x2, x0 ) - const __m128 vxy = _mm_shuffle_ps(vx, vy, _MM_SHUFFLE(2, 0, 2, 0)); - // vyz = ( z3, z1, y3, y1 ) - const __m128 vyz = _mm_shuffle_ps(vy, vz, _MM_SHUFFLE(3, 1, 3, 1)); - // vzx = ( x3, x1, z2, z0 ) - const __m128 vzx = _mm_shuffle_ps(vz, vx, _MM_SHUFFLE(3, 1, 2, 0)); - - // vxyz0 = ( x1, z0, y0, x0 ) - const __m128 vxyz0 = _mm_shuffle_ps(vxy, vzx, _MM_SHUFFLE(2, 0, 2, 0)); - // vxyz1 = ( y2, x2, z1, y1 ) - const __m128 vxyz1 = _mm_shuffle_ps(vyz, vxy, _MM_SHUFFLE(3, 1, 2, 0)); - // vxyz2 = ( z3, y3, x3, z2 ) - const __m128 vxyz2 = _mm_shuffle_ps(vzx, vyz, _MM_SHUFFLE(3, 1, 3, 1)); - - _mm_storeu_ps(o, vxyz0); - _mm_storeu_ps(o + 4, vxyz1); - _mm_storeu_ps(o + 8, vxyz2); - o += 12; - n -= 16; - } - if XNN_UNLIKELY(n != 0) { - if (n & 8) { - // vx = ( -, -, x1, x0 ) - const __m128 vx = _mm_castpd_ps(_mm_load_sd((const double*) x)); - x += 2; - // vy = ( -, -, y1, y0 ) - const __m128 vy = _mm_castpd_ps(_mm_load_sd((const double*) y)); - y += 2; - // vz = ( -, -, z1, z0 ) - const __m128 vz = _mm_castpd_ps(_mm_load_sd((const double*) z)); - z += 2; - - // vxy = ( y1, x1, y0, x0 ) - const __m128 vxy = _mm_unpacklo_ps(vx, vy); - // vzx = ( x1, z1, x0, z0 ) - const __m128 vzx = _mm_unpacklo_ps(vz, vx); - // vyz = ( z1, y1, z0, y0 ) - const __m128 vyz = _mm_unpacklo_ps(vy, vz); - - _mm_storeu_ps(o, _mm_shuffle_ps(vxy, vzx, _MM_SHUFFLE(3, 0, 1, 0))); - _mm_storeh_pi((__m64*) (o + 4), vyz); - o += 6; - } - if (n & 4) { - const __m128 vx = _mm_load_ss(x); - const __m128 vy = _mm_load_ss(y); - const __m128 vz = _mm_load_ss(z); - _mm_store_ss(o, vx); - _mm_store_ss(o + 1, vy); - _mm_store_ss(o + 2, vz); - } - } -} diff --git a/src/x32-zip/x32-zip-x3-wasmsimd.c b/src/x32-zip/x32-zip-x3-wasmsimd.c deleted file mode 100644 index 3aac4b632c7..00000000000 --- a/src/x32-zip/x32-zip-x3-wasmsimd.c +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x3_ukernel__wasmsimd( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % sizeof(uint32_t) == 0); - - const float* x = (const float*) input; - const float* y = (const float*) ((uintptr_t) x + n); - const float* z = (const float*) ((uintptr_t) y + n); - float* o = (float*) output; - - while (n >= 4 * sizeof(uint32_t)) { - // vx = ( x3, x2, x1, x0 ) - const v128_t vx = wasm_v128_load(x); - x += 4; - // vy = ( y3, y2, y1, y0 ) - const v128_t vy = wasm_v128_load(y); - y += 4; - // vz = ( z3, z2, z1, z0 ) - const v128_t vz = wasm_v128_load(z); - z += 4; - - // vxy = ( y2, y0, x2, x0 ) - const v128_t vxy = wasm_v32x4_shuffle(vx, vy, 0, 2, 4, 6); - // vyz = ( z3, z1, y3, y1 ) - const v128_t vyz = wasm_v32x4_shuffle(vy, vz, 1, 3, 5, 7); - // vzx = ( x3, x1, z2, z0 ) - const v128_t vzx = wasm_v32x4_shuffle(vz, vx, 0, 2, 5, 7); - - // vxyz0 = ( x1, z0, y0, x0 ) - const v128_t vxyz0 = wasm_v32x4_shuffle(vxy, vzx, 0, 2, 4, 6); - // vxyz1 = ( y2, x2, z1, y1 ) - const v128_t vxyz1 = wasm_v32x4_shuffle(vyz, vxy, 0, 2, 5, 7); - // vxyz2 = ( z3, y3, x3, z2 ) - const v128_t vxyz2 = wasm_v32x4_shuffle(vzx, vyz, 1, 3, 5, 7); - - wasm_v128_store(o, vxyz0); - wasm_v128_store(o + 4, vxyz1); - wasm_v128_store(o + 8, vxyz2); - o += 12; - n -= 4 * sizeof(uint32_t); - } - if XNN_UNLIKELY(n != 0) { - do { - const float vx = *x++; - const float vy = *y++; - const float vz = *z++; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o += 3; - n -= sizeof(uint32_t); - } while (n != 0); - } -} diff --git a/src/x32-zip/x32-zip-x4-neon.c b/src/x32-zip/x32-zip-x4-neon.c deleted file mode 100644 index ef9f54b5afa..00000000000 --- a/src/x32-zip/x32-zip-x4-neon.c +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x4_ukernel__neon( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - - const uint32_t* x = input; - const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); - const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n); - const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n); - uint32_t* o = output; - - while (n >= 16) { - uint32x4x4_t vxyzw; - vxyzw.val[0] = vld1q_u32(x); x += 4; - vxyzw.val[1] = vld1q_u32(y); y += 4; - vxyzw.val[2] = vld1q_u32(z); z += 4; - vxyzw.val[3] = vld1q_u32(w); w += 4; - vst4q_u32(o, vxyzw); o += 16; - n -= 16; - } - if XNN_UNLIKELY(n != 0) { - if (n & 8) { - uint32x2x4_t vxyzw; - vxyzw.val[0] = vld1_u32(x); x += 2; - vxyzw.val[1] = vld1_u32(y); y += 2; - vxyzw.val[2] = vld1_u32(z); z += 2; - vxyzw.val[3] = vld1_u32(w); w += 2; - vst4_u32(o, vxyzw); o += 8; - } - if (n & 4) { - uint32x4_t vxyzw = vld1q_dup_u32(x); - vxyzw = vld1q_lane_u32(y, vxyzw, 1); - vxyzw = vld1q_lane_u32(z, vxyzw, 2); - vxyzw = vld1q_lane_u32(w, vxyzw, 3); - vst1q_u32(o, vxyzw); - } - } -} diff --git a/src/x32-zip/x32-zip-x4-scalar.c b/src/x32-zip/x32-zip-x4-scalar.c deleted file mode 100644 index 73b36443c1b..00000000000 --- a/src/x32-zip/x32-zip-x4-scalar.c +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x4_ukernel__scalar( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - - const uint32_t* x = input; - const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); - const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n); - const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n); - uint32_t* o = output; - - do { - const uint32_t vx = *x++; - const uint32_t vy = *y++; - const uint32_t vz = *z++; - const uint32_t vw = *w++; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o[3] = vw; - o += 4; - - n -= 4; - } while (n != 0); -} diff --git a/src/x32-zip/x32-zip-x4-sse2.c b/src/x32-zip/x32-zip-x4-sse2.c deleted file mode 100644 index 82245e7b8d8..00000000000 --- a/src/x32-zip/x32-zip-x4-sse2.c +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x4_ukernel__sse2( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - - const uint32_t* x = input; - const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n); - const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n); - const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n); - uint32_t* o = output; - - while (n >= 16) { - const __m128i vx = _mm_loadu_si128((const __m128i*) x); - x += 4; - const __m128i vy = _mm_loadu_si128((const __m128i*) y); - y += 4; - const __m128i vz = _mm_loadu_si128((const __m128i*) z); - z += 4; - const __m128i vw = _mm_loadu_si128((const __m128i*) w); - w += 4; - - const __m128i vxy_lo = _mm_unpacklo_epi32(vx, vy); - const __m128i vxy_hi = _mm_unpackhi_epi32(vx, vy); - const __m128i vzw_lo = _mm_unpacklo_epi32(vz, vw); - const __m128i vzw_hi = _mm_unpackhi_epi32(vz, vw); - - const __m128i vxyzw0 = _mm_unpacklo_epi64(vxy_lo, vzw_lo); - const __m128i vxyzw1 = _mm_unpackhi_epi64(vxy_lo, vzw_lo); - const __m128i vxyzw2 = _mm_unpacklo_epi64(vxy_hi, vzw_hi); - const __m128i vxyzw3 = _mm_unpackhi_epi64(vxy_hi, vzw_hi); - - _mm_storeu_si128((__m128i*) o, vxyzw0); - _mm_storeu_si128((__m128i*) (o + 4), vxyzw1); - _mm_storeu_si128((__m128i*) (o + 8), vxyzw2); - _mm_storeu_si128((__m128i*) (o + 12), vxyzw3); - o += 16; - n -= 16; - } - if XNN_UNLIKELY(n != 0) { - if (n & 8) { - const __m128i vx = _mm_loadl_epi64((const __m128i*) x); - x += 2; - const __m128i vy = _mm_loadl_epi64((const __m128i*) y); - y += 2; - const __m128i vz = _mm_loadl_epi64((const __m128i*) z); - z += 2; - const __m128i vw = _mm_loadl_epi64((const __m128i*) w); - w += 2; - - const __m128i vxy = _mm_unpacklo_epi32(vx, vy); - const __m128i vzw = _mm_unpacklo_epi32(vz, vw); - - const __m128i vxyzw_lo = _mm_unpacklo_epi64(vxy, vzw); - const __m128i vxyzw_hi = _mm_unpackhi_epi64(vxy, vzw); - - _mm_storeu_si128((__m128i*) o, vxyzw_lo); - _mm_storeu_si128((__m128i*) (o + 4), vxyzw_hi); - o += 8; - } - if (n & 4) { - const uint32_t vx = *x; - const uint32_t vy = *y; - const uint32_t vz = *z; - const uint32_t vw = *w; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o[3] = vw; - } - } -} diff --git a/src/x32-zip/x32-zip-x4-wasmsimd.c b/src/x32-zip/x32-zip-x4-wasmsimd.c deleted file mode 100644 index 74dd3599461..00000000000 --- a/src/x32-zip/x32-zip-x4-wasmsimd.c +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_x4_ukernel__wasmsimd( - size_t n, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % sizeof(uint32_t) == 0); - - const float* x = (const float*) input; - const float* y = (const float*) ((uintptr_t) x + n); - const float* z = (const float*) ((uintptr_t) y + n); - const float* w = (const float*) ((uintptr_t) z + n); - float* o = (float*) output; - - while (n >= 4 * sizeof(uint32_t)) { - const v128_t vx = wasm_v128_load(x); - x += 4; - const v128_t vy = wasm_v128_load(y); - y += 4; - const v128_t vz = wasm_v128_load(z); - z += 4; - const v128_t vw = wasm_v128_load(w); - w += 4; - - const v128_t vxy_lo = wasm_v32x4_shuffle(vx, vy, 0, 4, 1, 5); - const v128_t vxy_hi = wasm_v32x4_shuffle(vx, vy, 2, 6, 3, 7); - const v128_t vzw_lo = wasm_v32x4_shuffle(vz, vw, 0, 4, 1, 5); - const v128_t vzw_hi = wasm_v32x4_shuffle(vz, vw, 2, 6, 3, 7); - - const v128_t vxyzw0 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 0, 1, 4, 5); - const v128_t vxyzw1 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 2, 3, 6, 7); - const v128_t vxyzw2 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 0, 1, 4, 5); - const v128_t vxyzw3 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 2, 3, 6, 7); - - wasm_v128_store(o, vxyzw0); - wasm_v128_store(o + 4, vxyzw1); - wasm_v128_store(o + 8, vxyzw2); - wasm_v128_store(o + 12, vxyzw3); - o += 16; - n -= 4 * sizeof(uint32_t); - } - if XNN_UNLIKELY(n != 0) { - if (n & (2 * sizeof(uint32_t))) { - const double vx = *((const double*) x); - x += 2; - const double vy = *((const double*) y); - y += 2; - const double vz = *((const double*) z); - z += 2; - const double vw = *((const double*) w); - w += 2; - - const v128_t vxy = wasm_f64x2_make(vx, vy); - const v128_t vzw = wasm_f64x2_make(vz, vw); - - const v128_t vxyzw_lo = wasm_v32x4_shuffle(vxy, vzw, 0, 2, 4, 6); - const v128_t vxyzw_hi = wasm_v32x4_shuffle(vxy, vzw, 1, 3, 5, 7); - - wasm_v128_store(o, vxyzw_lo); - wasm_v128_store(o + 4, vxyzw_hi); - o += 8; - } - if (n & (1 * sizeof(uint32_t))) { - const float vx = *x; - const float vy = *y; - const float vz = *z; - const float vw = *w; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o[3] = vw; - } - } -} diff --git a/src/x32-zip/x32-zip-xm-neon.c b/src/x32-zip/x32-zip-xm-neon.c deleted file mode 100644 index 13c56e6ced7..00000000000 --- a/src/x32-zip/x32-zip-xm-neon.c +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_xm_ukernel__neon( - size_t n, - size_t m, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - assert(m >= 4); - - const uint32_t* w = input; - const size_t group_increment = m * 4; - const size_t input_increment = n * 3; - const size_t output_increment = 16 - m * n; - const uint32_t* last_input = (const uint32_t*) ((uintptr_t) input + n * (m - 1)); - uint32_t* last_output = (uint32_t*) ((uintptr_t) output + (m * 4 - 16)); - - for (size_t i = 0; i < m; i += 4) { - w = (const uint32_t*) ((uintptr_t) w + input_increment); - if (w >= last_input) { - w = last_input; - } - const uint32_t* z = (const uint32_t*) ((uintptr_t) w - n); - const uint32_t* y = (const uint32_t*) ((uintptr_t) z - n); - const uint32_t* x = (const uint32_t*) ((uintptr_t) y - n); - - size_t k = n; - while (k >= 16) { - const uint32x4_t vx = vld1q_u32(x); x += 4; - const uint32x4_t vy = vld1q_u32(y); y += 4; - const uint32x4_t vz = vld1q_u32(z); z += 4; - const uint32x4_t vw = vld1q_u32(w); w += 4; - - const uint32x4x2_t vxy = vzipq_u32(vx, vy); - const uint32x4x2_t vzw = vzipq_u32(vz, vw); - - vst1_u32(output, vget_low_u32(vxy.val[0])); - vst1_u32(output + 2, vget_low_u32(vzw.val[0])); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - vst1_u32(output, vget_high_u32(vxy.val[0])); - vst1_u32(output + 2, vget_high_u32(vzw.val[0])); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - vst1_u32(output, vget_low_u32(vxy.val[1])); - vst1_u32(output + 2, vget_low_u32(vzw.val[1])); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - vst1_u32(output, vget_high_u32(vxy.val[1])); - vst1_u32(output + 2, vget_high_u32(vzw.val[1])); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - k -= 16; - } - if XNN_UNLIKELY(k != 0) { - if (k & 8) { - const uint32x2_t vx = vld1_u32(x); x += 2; - const uint32x2_t vy = vld1_u32(y); y += 2; - const uint32x2_t vz = vld1_u32(z); z += 2; - const uint32x2_t vw = vld1_u32(w); w += 2; - - const uint32x2x2_t vxy = vzip_u32(vx, vy); - const uint32x2x2_t vzw = vzip_u32(vz, vw); - - vst1_u32(output, vxy.val[0]); - vst1_u32(output + 2, vzw.val[0]); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - vst1_u32(output, vxy.val[1]); - vst1_u32(output + 2, vzw.val[1]); - output = (uint32_t*) ((uintptr_t) output + group_increment); - } - if (k & 4) { - const uint32x2_t vx = vld1_dup_u32(x); - const uint32x2_t vz = vld1_dup_u32(z); - const uint32x2_t vxy = vld1_lane_u32(y, vx, 1); - const uint32x2_t vzw = vld1_lane_u32(w, vz, 1); w += 1; - - vst1_u32(output, vxy); - vst1_u32(output + 2, vzw); - output = (uint32_t*) ((uintptr_t) output + group_increment); - } - } - output = (uint32_t*) ((uintptr_t) output + output_increment); - if (output > last_output) { - output = last_output; - } - } -} diff --git a/src/x32-zip/x32-zip-xm-scalar.c b/src/x32-zip/x32-zip-xm-scalar.c deleted file mode 100644 index 5d29999fa50..00000000000 --- a/src/x32-zip/x32-zip-xm-scalar.c +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_xm_ukernel__scalar( - size_t n, - size_t m, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - assert(m >= 4); - - size_t k = n; - do { - size_t l = m; - const uint32_t* input_column = input++; - do { - *output++ = *input_column; - input_column = (uint32_t*) ((uintptr_t) input_column + n); - } while (--l != 0); - k -= 4; - } while (k != 0); -} diff --git a/src/x32-zip/x32-zip-xm-sse2.c b/src/x32-zip/x32-zip-xm-sse2.c deleted file mode 100644 index e5734c6732b..00000000000 --- a/src/x32-zip/x32-zip-xm-sse2.c +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_xm_ukernel__sse2( - size_t n, - size_t m, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % 4 == 0); - assert(m >= 4); - - const uint32_t* w = input; - const size_t group_increment = m * 4; - const size_t input_increment = n * 3; - const size_t output_increment = 16 - m * n; - const uint32_t* last_input = (const uint32_t*) ((uintptr_t) input + n * (m - 1)); - uint32_t* last_output = (uint32_t*) ((uintptr_t) output + (m * 4 - 16)); - - for (size_t i = 0; i < m; i += 4) { - w = (const uint32_t*) ((uintptr_t) w + input_increment); - if (w >= last_input) { - w = last_input; - } - const uint32_t* z = (const uint32_t*) ((uintptr_t) w - n); - const uint32_t* y = (const uint32_t*) ((uintptr_t) z - n); - const uint32_t* x = (const uint32_t*) ((uintptr_t) y - n); - - size_t k = n; - while (k >= 16) { - const __m128i vx = _mm_loadu_si128((const __m128i*) x); - x += 4; - const __m128i vy = _mm_loadu_si128((const __m128i*) y); - y += 4; - const __m128i vz = _mm_loadu_si128((const __m128i*) z); - z += 4; - const __m128i vw = _mm_loadu_si128((const __m128i*) w); - w += 4; - - const __m128i vxy_lo = _mm_unpacklo_epi32(vx, vy); - const __m128i vxy_hi = _mm_unpackhi_epi32(vx, vy); - const __m128i vzw_lo = _mm_unpacklo_epi32(vz, vw); - const __m128i vzw_hi = _mm_unpackhi_epi32(vz, vw); - - const __m128i vxyzw0 = _mm_unpacklo_epi64(vxy_lo, vzw_lo); - const __m128i vxyzw1 = _mm_unpackhi_epi64(vxy_lo, vzw_lo); - const __m128i vxyzw2 = _mm_unpacklo_epi64(vxy_hi, vzw_hi); - const __m128i vxyzw3 = _mm_unpackhi_epi64(vxy_hi, vzw_hi); - - _mm_storeu_si128((__m128i*) output, vxyzw0); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - _mm_storeu_si128((__m128i*) output, vxyzw1); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - _mm_storeu_si128((__m128i*) output, vxyzw2); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - _mm_storeu_si128((__m128i*) output, vxyzw3); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - k -= 16; - } - if XNN_UNLIKELY(k != 0) { - if (k & 8) { - const __m128i vx = _mm_loadl_epi64((const __m128i*) x); - x += 2; - const __m128i vy = _mm_loadl_epi64((const __m128i*) y); - y += 2; - const __m128i vz = _mm_loadl_epi64((const __m128i*) z); - z += 2; - const __m128i vw = _mm_loadl_epi64((const __m128i*) w); - w += 2; - - const __m128i vxy = _mm_unpacklo_epi32(vx, vy); - const __m128i vzw = _mm_unpacklo_epi32(vz, vw); - - const __m128i vxyzw_lo = _mm_unpacklo_epi64(vxy, vzw); - const __m128i vxyzw_hi = _mm_unpackhi_epi64(vxy, vzw); - - _mm_storeu_si128((__m128i*) output, vxyzw_lo); - output = (uint32_t*) ((uintptr_t) output + group_increment); - - _mm_storeu_si128((__m128i*) output, vxyzw_hi); - output = (uint32_t*) ((uintptr_t) output + group_increment); - } - if (k & 4) { - const uint32_t vx = *x; - const uint32_t vy = *y; - const uint32_t vz = *z; - const uint32_t vw = *w++; - - output[0] = vx; - output[1] = vy; - output[2] = vz; - output[3] = vw; - output = (uint32_t*) ((uintptr_t) output + group_increment); - } - } - output = (uint32_t*) ((uintptr_t) output + output_increment); - if (output > last_output) { - output = last_output; - } - } -} diff --git a/src/x32-zip/x32-zip-xm-wasmsimd.c b/src/x32-zip/x32-zip-xm-wasmsimd.c deleted file mode 100644 index 69d86ce317c..00000000000 --- a/src/x32-zip/x32-zip-xm-wasmsimd.c +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright 2020 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include - -#include "xnnpack/zip.h" - - -void xnn_x32_zip_xm_ukernel__wasmsimd( - size_t n, - size_t m, - const uint32_t* input, - uint32_t* output) -{ - assert(n != 0); - assert(n % sizeof(uint32_t) == 0); - assert(m >= 4); - - const float* w = (const float*) input; - float* o = (float*) output; - const size_t group_increment = m * 4; - const size_t input_increment = n * 3; - const size_t output_increment = 4 * sizeof(uint32_t) - m * n; - const float* last_input = (const float*) ((uintptr_t) input + n * (m - 1)); - float* last_output = (float*) ((uintptr_t) output + (m * 4 - 4 * sizeof(uint32_t))); - - for (size_t i = 0; i < m; i += 4) { - w = (const float*) ((uintptr_t) w + input_increment); - if (w >= last_input) { - w = last_input; - } - const float* z = (const float*) ((uintptr_t) w - n); - const float* y = (const float*) ((uintptr_t) z - n); - const float* x = (const float*) ((uintptr_t) y - n); - - size_t k = n; - while (k >= 4 * sizeof(uint32_t)) { - const v128_t vx = wasm_v128_load((const v128_t*) x); - x += 4; - const v128_t vy = wasm_v128_load((const v128_t*) y); - y += 4; - const v128_t vz = wasm_v128_load((const v128_t*) z); - z += 4; - const v128_t vw = wasm_v128_load((const v128_t*) w); - w += 4; - - const v128_t vxy_lo = wasm_v32x4_shuffle(vx, vy, 0, 4, 1, 5); - const v128_t vxy_hi = wasm_v32x4_shuffle(vx, vy, 2, 6, 3, 7); - const v128_t vzw_lo = wasm_v32x4_shuffle(vz, vw, 0, 4, 1, 5); - const v128_t vzw_hi = wasm_v32x4_shuffle(vz, vw, 2, 6, 3, 7); - - const v128_t vxyzw0 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 0, 1, 4, 5); - const v128_t vxyzw1 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 2, 3, 6, 7); - const v128_t vxyzw2 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 0, 1, 4, 5); - const v128_t vxyzw3 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 2, 3, 6, 7); - - wasm_v128_store(o, vxyzw0); - o = (float*) ((uintptr_t) o + group_increment); - - wasm_v128_store(o, vxyzw1); - o = (float*) ((uintptr_t) o + group_increment); - - wasm_v128_store(o, vxyzw2); - o = (float*) ((uintptr_t) o + group_increment); - - wasm_v128_store(o, vxyzw3); - o = (float*) ((uintptr_t) o + group_increment); - - k -= 4 * sizeof(uint32_t); - } - if XNN_UNLIKELY(k != 0) { - if (k & (2 * sizeof(uint32_t))) { - const double vx = *((const double*) x); - x += 2; - const double vy = *((const double*) y); - y += 2; - const double vz = *((const double*) z); - z += 2; - const double vw = *((const double*) w); - w += 2; - - const v128_t vxy = wasm_f64x2_make(vx, vy); - const v128_t vzw = wasm_f64x2_make(vz, vw); - - const v128_t vxyzw_lo = wasm_v32x4_shuffle(vxy, vzw, 0, 2, 4, 6); - const v128_t vxyzw_hi = wasm_v32x4_shuffle(vxy, vzw, 1, 3, 5, 7); - - wasm_v128_store(o, vxyzw_lo); - o = (float*) ((uintptr_t) o + group_increment); - - wasm_v128_store(o, vxyzw_hi); - o = (float*) ((uintptr_t) o + group_increment); - } - if (k & (1 * sizeof(uint32_t))) { - const float vx = *x; - const float vy = *y; - const float vz = *z; - const float vw = *w++; - - o[0] = vx; - o[1] = vy; - o[2] = vz; - o[3] = vw; - o = (float*) ((uintptr_t) o + group_increment); - } - } - o = (float*) ((uintptr_t) o + output_increment); - if (o > last_output) { - o = last_output; - } - } -} diff --git a/src/x8-zip/x8-zip-x2-neon.c b/src/x8-zip/x8-zip-x2-neon.c deleted file mode 100644 index 19b9c97f663..00000000000 --- a/src/x8-zip/x8-zip-x2-neon.c +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_x2_ukernel__neon( - size_t n, - const uint8_t* input, - uint8_t* output) -{ - const uint8_t* x = input; - const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); - uint8_t* o = output; - - if (n >= 8) { - do { - uint8x8x2_t vxy; - vxy.val[0] = vld1_u8(x); x += 8; - vxy.val[1] = vld1_u8(y); y += 8; - vst2_u8(o, vxy); o += 16;; - n -= 8; - } while (n >= 8); - if (n != 0) { - const size_t address_increment = n - 8; - uint8x8x2_t vxy; - vxy.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); - vxy.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); - vst2_u8((uint8_t*) ((uintptr_t) o + address_increment * 2), vxy); - } - } else { - do { - const uint8_t vx = *x++; - const uint8_t vy = *y++; - o[0] = vx; - o[1] = vy; - o += 2; - } while (--n != 0); - } -} diff --git a/src/x8-zip/x8-zip-x2-scalar.c b/src/x8-zip/x8-zip-x2-scalar.c deleted file mode 100644 index a0ffcd24ce4..00000000000 --- a/src/x8-zip/x8-zip-x2-scalar.c +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_x2_ukernel__scalar( - size_t n, - const uint8_t* input, - uint8_t* output) -{ - assert(n != 0); - - const uint8_t* x = input; - const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); - uint8_t* o = output; - - do { - const uint8_t vx = *x++; - const uint8_t vy = *y++; - o[0] = vx; - o[1] = vy; - o += 2; - - n -= sizeof(uint8_t); - } while (n != 0); -} diff --git a/src/x8-zip/x8-zip-x2-sse2.c b/src/x8-zip/x8-zip-x2-sse2.c deleted file mode 100644 index 640832fa874..00000000000 --- a/src/x8-zip/x8-zip-x2-sse2.c +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_x2_ukernel__sse2( - size_t n, - const uint8_t* input, - uint8_t* output) -{ - const uint8_t* x = input; - const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); - uint8_t* o = output; - - if (n >= 16) { - do { - const __m128i vx = _mm_loadu_si128((const __m128i*) x); - x += 16; - const __m128i vy = _mm_loadu_si128((const __m128i*) y); - y += 16; - const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy); - const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy); - _mm_storeu_si128((__m128i*) o, vxy_lo); - _mm_storeu_si128((__m128i*) (o + 16), vxy_hi); - o = (void*) ((uintptr_t) o + 32); - n -= 16; - } while (n >= 16); - if (n != 0) { - const size_t address_increment = n - 16; - const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment)); - const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment)); - const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy); - const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy); - o = (void*) ((uintptr_t) o + address_increment * 2); - _mm_storeu_si128((__m128i*) o, vxy_lo); - _mm_storeu_si128((__m128i*) o + 1, vxy_hi); - } - } else { - do { - const uint8_t vx = *x++; - const uint8_t vy = *y++; - o[0] = vx; - o[1] = vy; - o += 2; - } while (--n != 0); - } -} diff --git a/src/x8-zip/x8-zip-x3-neon.c b/src/x8-zip/x8-zip-x3-neon.c deleted file mode 100644 index 6e947ebdee2..00000000000 --- a/src/x8-zip/x8-zip-x3-neon.c +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_x3_ukernel__neon( - size_t n, - const uint8_t* input, - uint8_t* output) -{ - const uint8_t* x = input; - const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); - const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); - uint8_t* o = output; - - if (n >= 8) { - do { - uint8x8x3_t vxyz; - vxyz.val[0] = vld1_u8(x); x += 8; - vxyz.val[1] = vld1_u8(y); y += 8; - vxyz.val[2] = vld1_u8(z); z += 8; - vst3_u8(o, vxyz); o += 24; - n -= 8; - } while (n >= 8); - if (n != 0) { - const size_t address_increment = n - 8; - uint8x8x3_t vxyz; - vxyz.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); - vxyz.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); - vxyz.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment)); - vst3_u8((uint8_t*) ((uintptr_t) o + address_increment * 3), vxyz); - } - } else { - do { - const uint8_t vx = *x++; - const uint8_t vy = *y++; - const uint8_t vz = *z++; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o += 3; - } while (--n != 0); - } -} diff --git a/src/x8-zip/x8-zip-x3-scalar.c b/src/x8-zip/x8-zip-x3-scalar.c deleted file mode 100644 index a5768086de6..00000000000 --- a/src/x8-zip/x8-zip-x3-scalar.c +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_x3_ukernel__scalar( - size_t n, - const uint8_t* input, - uint8_t* output) -{ - const uint8_t* x = input; - const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); - const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); - uint8_t* o = output; - - do { - const uint8_t vx = *x++; - const uint8_t vy = *y++; - const uint8_t vz = *z++; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o += 3; - - n -= sizeof(uint8_t); - } while (n != 0); -} diff --git a/src/x8-zip/x8-zip-x3-sse2.c b/src/x8-zip/x8-zip-x3-sse2.c deleted file mode 100644 index 4ac5cfd76ee..00000000000 --- a/src/x8-zip/x8-zip-x3-sse2.c +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_x3_ukernel__sse2( - size_t n, - const uint8_t* input, - uint8_t* output) -{ - const uint8_t* x = input; - const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); - const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); - uint8_t* o = output; - - if (n >= 16) { - const __m128i vmask0x00FF00FF = _mm_set1_epi16(0x00FF); - const __m128i vmask0x0000FFFF = _mm_set1_epi32(0x0000FFFF); - do { - // vx = ( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ) - const __m128i vx = _mm_loadu_si128((const __m128i*) x); - x += 16; - // vy = ( y15, y14, y13, y12, y11, y10, y9, y8, y7, y6, y5, y4, y3, y2, y1, y0 ) - const __m128i vy = _mm_loadu_si128((const __m128i*) y); - y += 16; - // vz = ( z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0 ) - const __m128i vz = _mm_loadu_si128((const __m128i*) z); - z += 16; - - // vxeye = ( y14, x14, y12, x12, y10, x10, y8, x8, y6, x6, y4, x4, y2, x2, y0, x0 ) - const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8)); - // vyozo = ( z15, y15, z13, y13, z11, y11, z9, y9, z7, y7, z5, y5, z3, y3, z1, y1 ) - const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8)); - // vzoxo = ( x15, z14, x13, z12, x11, z10, x9, z8, x7, z6, x5, z4, x3, z2, x1, z0 ) - const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx)); - - // vxeyezexo = ( x13, z12, y12, x12, x9, z8, y8, x8, x5, z4, y4, x4, x1, z0, y0, x0 ) - const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16)); - // vyozoxeye = ( y14, x14, z13, y13, y10, x10, z9, y9, y6, x6, z5, y5, y2, x2, z1, y1 ) - const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye)); - // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10, z7, y7, x7, z6, z3, y3, x3, z2 ) - const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16)); - - // vtemp0 = ( x13, z12, y12, x12, x5, z4, y4, x4, z11, y11, x11, z10, z3, y3, x3, z2 ) - const __m128i vtemp0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0))); - // vtemp1 = ( y10, x10, z9, y9, y2, x2, z1, y1, x9, z8, y8, x8, x1, z0, y0, x0 ) - const __m128i vtemp1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0))); - // vtemp2 = ( z15, y15, x15, z14, z7, y7, x7, z6, y14, x14, z13, y13, y6, x6, z5, y5 ) - const __m128i vtemp2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1))); - - // vxyz0 = ( x5, z4, y4, x4, z3, y3, x3, z2, y2, x2, z1, y1, x1, z0, y0, x0 ) - const __m128i vxyz0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0))); - // vxyz1 = ( y10, x10, z9, y9, x9, z8, y8, x8, z7, y7, x7, z6, y6, x6, z5, y5 ) - const __m128i vxyz1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0))); - // vxyz2 = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 ) - const __m128i vxyz2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1))); - - _mm_storeu_si128((__m128i*) o, vxyz0); - _mm_storeu_si128((__m128i*) o + 1, vxyz1); - _mm_storeu_si128((__m128i*) o + 2, vxyz2); - o += 48; - n -= 16; - } while (n >= 16); - if (n != 0) { - const size_t address_increment = n - 16; - // vx = ( x15, x14, x13, x12, x11, x10, x9, x8, x7, x6, x5, x4, x3, x2, x1, x0 ) - const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment)); - // vy = ( y15, y14, y13, y12, y11, y10, y9, y8, y7, y6, y5, y4, y3, y2, y1, y0 ) - const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment)); - // vz = ( z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0 ) - const __m128i vz = _mm_loadu_si128((const __m128i*) ((uintptr_t) z + address_increment)); - - // vxeye = ( y14, x14, y12, x12, y10, x10, y8, x8, y6, x6, y4, x4, y2, x2, y0, x0 ) - const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8)); - // vyozo = ( z15, y15, z13, y13, z11, y11, z9, y9, z7, y7, z5, y5, z3, y3, z1, y1 ) - const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8)); - // vzoxo = ( x15, z14, x13, z12, x11, z10, x9, z8, x7, z6, x5, z4, x3, z2, x1, z0 ) - const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx)); - - // vxeyezexo = ( x13, z12, y12, x12, x9, z8, y8, x8, x5, z4, y4, x4, x1, z0, y0, x0 ) - const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16)); - // vyozoxeye = ( y14, x14, z13, y13, y10, x10, z9, y9, y6, x6, z5, y5, y2, x2, z1, y1 ) - const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye)); - // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10, z7, y7, x7, z6, z3, y3, x3, z2 ) - const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16)); - - // vtemp0 = ( x13, z12, y12, x12, x5, z4, y4, x4, z11, y11, x11, z10, z3, y3, x3, z2 ) - const __m128i vtemp0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0))); - // vtemp1 = ( y10, x10, z9, y9, y2, x2, z1, y1, x9, z8, y8, x8, x1, z0, y0, x0 ) - const __m128i vtemp1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0))); - // vtemp2 = ( z15, y15, x15, z14, z7, y7, x7, z6, y14, x14, z13, y13, y6, x6, z5, y5 ) - const __m128i vtemp2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1))); - - // vxyz0 = ( x5, z4, y4, x4, z3, y3, x3, z2, y2, x2, z1, y1, x1, z0, y0, x0 ) - const __m128i vxyz0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0))); - // vxyz1 = ( y10, x10, z9, y9, x9, z8, y8, x8, z7, y7, x7, z6, y6, x6, z5, y5 ) - const __m128i vxyz1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0))); - // vxyz2 = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 ) - const __m128i vxyz2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1))); - - o = (uint8_t*) ((uintptr_t) o + address_increment * 3); - _mm_storeu_si128((__m128i*) o, vxyz0); - _mm_storeu_si128((__m128i*) o + 1, vxyz1); - _mm_storeu_si128((__m128i*) o + 2, vxyz2); - } - } else { - do { - const uint8_t vx = *x++; - const uint8_t vy = *y++; - const uint8_t vz = *z++; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o += 3; - } while (--n != 0); - } -} diff --git a/src/x8-zip/x8-zip-x4-neon.c b/src/x8-zip/x8-zip-x4-neon.c deleted file mode 100644 index 158b325f653..00000000000 --- a/src/x8-zip/x8-zip-x4-neon.c +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_x4_ukernel__neon( - size_t n, - const uint8_t* input, - uint8_t* output) -{ - const uint8_t* x = input; - const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); - const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); - const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n); - uint8_t* o = output; - - if (n >= 8) { - do { - uint8x8x4_t vxyzw; - vxyzw.val[0] = vld1_u8(x); x += 8; - vxyzw.val[1] = vld1_u8(y); y += 8; - vxyzw.val[2] = vld1_u8(z); z += 8; - vxyzw.val[3] = vld1_u8(w); w += 8; - vst4_u8(o, vxyzw); o += 32; - n -= 8; - } while (n >= 8); - if (n != 0) { - const size_t address_increment = n - 8; - uint8x8x4_t vxyzw; - vxyzw.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment)); - vxyzw.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment)); - vxyzw.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment)); - vxyzw.val[3] = vld1_u8((const uint8_t*) ((uintptr_t) w + address_increment)); - vst4_u8((uint8_t*) ((uintptr_t) o + address_increment * 4), vxyzw); - } - } else { - do { - const uint8_t vx = *x++; - const uint8_t vy = *y++; - const uint8_t vz = *z++; - const uint8_t vw = *w++; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o[3] = vw; - o += 4; - } while (--n != 0); - } -} diff --git a/src/x8-zip/x8-zip-x4-scalar.c b/src/x8-zip/x8-zip-x4-scalar.c deleted file mode 100644 index bfce3071ab3..00000000000 --- a/src/x8-zip/x8-zip-x4-scalar.c +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_x4_ukernel__scalar( - size_t n, - const uint8_t* input, - uint8_t* output) -{ - assert(n != 0); - - const uint8_t* x = input; - const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); - const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); - const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n); - uint8_t* o = output; - - do { - const uint8_t vx = *x++; - const uint8_t vy = *y++; - const uint8_t vz = *z++; - const uint8_t vw = *w++; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o[3] = vw; - o += 4; - - n -= sizeof(uint8_t); - } while (n != 0); -} diff --git a/src/x8-zip/x8-zip-x4-sse2.c b/src/x8-zip/x8-zip-x4-sse2.c deleted file mode 100644 index b00dfc8901e..00000000000 --- a/src/x8-zip/x8-zip-x4-sse2.c +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_x4_ukernel__sse2( - size_t n, - const uint8_t* input, - uint8_t* output) -{ - const uint8_t* x = input; - const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n); - const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n); - const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n); - uint8_t* o = output; - - if (n >= 16) { - do { - const __m128i vx = _mm_loadu_si128((const __m128i*) x); - x += 16; - const __m128i vy = _mm_loadu_si128((const __m128i*) y); - y += 16; - const __m128i vz = _mm_loadu_si128((const __m128i*) z); - z += 16; - const __m128i vw = _mm_loadu_si128((const __m128i*) w); - w += 16; - const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy); - const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy); - const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw); - const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw); - const __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo); - const __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo); - const __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi); - const __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi); - _mm_storeu_si128((__m128i*) o, vxyzw0); - _mm_storeu_si128((__m128i*) o + 1, vxyzw1); - _mm_storeu_si128((__m128i*) o + 2, vxyzw2); - _mm_storeu_si128((__m128i*) o + 3, vxyzw3); - o = (void*) ((uintptr_t) o + 64); - n -= 16; - } while (n >= 16); - if (n != 0) { - const size_t address_increment = n - 16; - const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment)); - const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment)); - const __m128i vz = _mm_loadu_si128((const __m128i*) ((uintptr_t) z + address_increment)); - const __m128i vw = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + address_increment)); - const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy); - const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy); - const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw); - const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw); - const __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo); - const __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo); - const __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi); - const __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi); - o = (void*) ((uintptr_t) o + address_increment * 4); - _mm_storeu_si128((__m128i*) o, vxyzw0); - _mm_storeu_si128((__m128i*) o + 1, vxyzw1); - _mm_storeu_si128((__m128i*) o + 2, vxyzw2); - _mm_storeu_si128((__m128i*) o + 3, vxyzw3); - } - } else { - do { - const uint8_t vx = *x++; - const uint8_t vy = *y++; - const uint8_t vz = *z++; - const uint8_t vw = *w++; - o[0] = vx; - o[1] = vy; - o[2] = vz; - o[3] = vw; - o += 4; - } while (--n != 0); - } -} diff --git a/src/x8-zip/x8-zip-xm-neon.c b/src/x8-zip/x8-zip-xm-neon.c deleted file mode 100644 index 7839d0826d8..00000000000 --- a/src/x8-zip/x8-zip-xm-neon.c +++ /dev/null @@ -1,144 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_xm_ukernel__neon( - size_t n, - size_t m, - const uint8_t* input, - uint8_t* output) -{ - const uint8_t* w = input; - const size_t input_increment = n * 3; - const size_t output_increment = 4 - m * n; - const uint8_t* last_input = w + n * (m - 1); - uint8_t* last_output = (uint8_t*) ((uintptr_t) output + (m - 4)); - - if (n >= 8) { - for (size_t i = 0; i < m; i += 4) { - size_t k = n; - w = (const uint8_t*) ((uintptr_t) w + input_increment); - if (w >= last_input) { - w = last_input; - } - const uint8_t* z = (const uint8_t*) ((uintptr_t) w - n); - const uint8_t* y = (const uint8_t*) ((uintptr_t) z - n); - const uint8_t* x = (const uint8_t*) ((uintptr_t) y - n); - while (k >= 8) { - const uint8x8_t vx = vld1_u8(x); x += 8; - const uint8x8_t vy = vld1_u8(y); y += 8; - const uint8x8_t vz = vld1_u8(z); z += 8; - const uint8x8_t vw = vld1_u8(w); w += 8; - - const uint8x8x2_t vxy = vzip_u8(vx, vy); - const uint8x8x2_t vzw = vzip_u8(vz, vw); - const uint16x4x2_t vxyzw_lo = vzip_u16(vreinterpret_u16_u8(vxy.val[0]), vreinterpret_u16_u8(vzw.val[0])); - const uint16x4x2_t vxyzw_hi = vzip_u16(vreinterpret_u16_u8(vxy.val[1]), vreinterpret_u16_u8(vzw.val[1])); - - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 0); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 1); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 0); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 1); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 0); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 1); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 0); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 1); - output = (uint8_t*) ((uintptr_t) output + m); - - k -= 8; - } - if (k != 0) { - const size_t address_increment = k - 8; - x = (const uint8_t*) ((uintptr_t) x + address_increment); - y = (const uint8_t*) ((uintptr_t) y + address_increment); - z = (const uint8_t*) ((uintptr_t) z + address_increment); - w = (const uint8_t*) ((uintptr_t) w + address_increment); - const int64x1_t vshift = vmov_n_s64(8 * address_increment); - - const uint64x1_t vx = vshl_u64(vreinterpret_u64_u8(vld1_u8(x)), vshift); - const uint64x1_t vy = vshl_u64(vreinterpret_u64_u8(vld1_u8(y)), vshift); - const uint64x1_t vz = vshl_u64(vreinterpret_u64_u8(vld1_u8(z)), vshift); - const uint64x1_t vw = vshl_u64(vreinterpret_u64_u8(vld1_u8(w)), vshift); w += 8; - const uint8x8x2_t vxy = vzip_u8(vreinterpret_u8_u64(vx), vreinterpret_u8_u64(vy)); - const uint8x8x2_t vzw = vzip_u8(vreinterpret_u8_u64(vz), vreinterpret_u8_u64(vw)); - const uint16x4x2_t vxyzw_lo = vzip_u16(vreinterpret_u16_u8(vxy.val[0]), vreinterpret_u16_u8(vzw.val[0])); - const uint16x4x2_t vxyzw_hi = vzip_u16(vreinterpret_u16_u8(vxy.val[1]), vreinterpret_u16_u8(vzw.val[1])); - - uint32x2_t vxyzw0 = vreinterpret_u32_u16(vxyzw_lo.val[0]); - uint32x2_t vxyzw1 = vreinterpret_u32_u16(vxyzw_lo.val[1]); - uint32x2_t vxyzw2 = vreinterpret_u32_u16(vxyzw_hi.val[0]); - uint32x2_t vxyzw3 = vreinterpret_u32_u16(vxyzw_hi.val[1]); - - if (k & 4) { - vst1_lane_u32((void*) output, vxyzw0, 0); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vxyzw0, 1); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vxyzw1, 0); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vxyzw1, 1); - output = (uint8_t*) ((uintptr_t) output + m); - - vxyzw0 = vxyzw2; - vxyzw1 = vxyzw3; - } - - if (k & 2) { - vst1_lane_u32((void*) output, vxyzw0, 0); - output = (uint8_t*) ((uintptr_t) output + m); - - vst1_lane_u32((void*) output, vxyzw0, 1); - output = (uint8_t*) ((uintptr_t) output + m); - - vxyzw0 = vxyzw1; - } - if (k & 1) { - vst1_lane_u32((void*) output, vxyzw0, 0); - output = (uint8_t*) ((uintptr_t) output + m); - } - } - output = (uint8_t*) ((uintptr_t) output + output_increment); - if (output > last_output) { - output = last_output; - } - } - } else { - const uint8_t* i = input; - uint8_t* o = output; - size_t k = n; - do { - size_t l = m; - const uint8_t* ii = i++; - do { - *o++ = *ii; - ii += n; - } while (--l != 0); - } while (--k != 0); - } -} diff --git a/src/x8-zip/x8-zip-xm-scalar.c b/src/x8-zip/x8-zip-xm-scalar.c deleted file mode 100644 index 4d2a4e55333..00000000000 --- a/src/x8-zip/x8-zip-xm-scalar.c +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" - - -void xnn_x8_zip_xm_ukernel__scalar( - size_t n, - size_t m, - const uint8_t* input, - uint8_t* output) -{ - assert(n != 0); - assert(m >= 4); - - size_t k = n; - do { - size_t l = m; - const uint8_t* input_column = input++; - do { - *output++ = *input_column; - input_column = (uint8_t*) ((uintptr_t) input_column + n); - } while (--l != 0); - k -= sizeof(uint8_t); - } while (k != 0); -} diff --git a/src/x8-zip/x8-zip-xm-sse2.c b/src/x8-zip/x8-zip-xm-sse2.c deleted file mode 100644 index 1309639dd59..00000000000 --- a/src/x8-zip/x8-zip-xm-sse2.c +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include "xnnpack/zip.h" -#include "xnnpack/unaligned.h" - - -void xnn_x8_zip_xm_ukernel__sse2( - size_t n, - size_t m, - const uint8_t* input, - uint8_t* output) -{ - const uint8_t* w = input; - const size_t input_increment = n * 3; - const size_t output_increment = 4 - m * n; - const uint8_t* last_input = w + n * (m - 1); - uint8_t* last_output = (uint8_t*) ((uintptr_t) output + (m - 4)); - - if (n >= 8) { - for (size_t i = 0; i < m; i += 4) { - size_t k = n; - w = (const uint8_t*) ((uintptr_t) w + input_increment); - if (w >= last_input) { - w = last_input; - } - const uint8_t* z = (const uint8_t*) ((uintptr_t) w - n); - const uint8_t* y = (const uint8_t*) ((uintptr_t) z - n); - const uint8_t* x = (const uint8_t*) ((uintptr_t) y - n); - while (k >= 16) { - const __m128i vx = _mm_loadu_si128((const __m128i*) x); - x += 16; - const __m128i vy = _mm_loadu_si128((const __m128i*) y); - y += 16; - const __m128i vz = _mm_loadu_si128((const __m128i*) z); - z += 16; - const __m128i vw = _mm_loadu_si128((const __m128i*) w); - w += 16; - const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy); - const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy); - const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw); - const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw); - __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo); - __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo); - __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi); - __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi); - - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw1 = _mm_unpackhi_epi64(vxyzw1, vxyzw1); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1)); - output = (uint8_t*) ((uintptr_t) output + m); - - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw2 = _mm_shufflelo_epi16(vxyzw2, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw2 = _mm_unpackhi_epi64(vxyzw2, vxyzw2); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw2 = _mm_shufflelo_epi16(vxyzw2, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2)); - output = (uint8_t*) ((uintptr_t) output + m); - - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw3 = _mm_shufflelo_epi16(vxyzw3, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw3 = _mm_unpackhi_epi64(vxyzw3, vxyzw3); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw3 = _mm_shufflelo_epi16(vxyzw3, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3)); - output = (uint8_t*) ((uintptr_t) output + m); - k -= 16; - }; - if (k >= 8) { - const __m128i vx = _mm_loadl_epi64((const __m128i*) x); - x += 8; - const __m128i vy = _mm_loadl_epi64((const __m128i*) y); - y += 8; - const __m128i vz = _mm_loadl_epi64((const __m128i*) z); - z += 8; - const __m128i vw = _mm_loadl_epi64((const __m128i*) w); - w += 8; - const __m128i vxy = _mm_unpacklo_epi8(vx, vy); - const __m128i vzw = _mm_unpacklo_epi8(vz, vw); - __m128i vxyzw0 = _mm_unpacklo_epi16(vxy, vzw); - __m128i vxyzw1 = _mm_unpackhi_epi16(vxy, vzw); - - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw1 = _mm_unpackhi_epi64(vxyzw1, vxyzw1); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1)); - output = (uint8_t*) ((uintptr_t) output + m); - k -= 8; - } - if (k != 0) { - const size_t address_decrement = 8 - k; - x -= address_decrement; - y -= address_decrement; - z -= address_decrement; - w -= address_decrement; - const __m128i vshift = _mm_cvtsi32_si128((int) address_decrement * 8); - - const __m128i vx = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) x), vshift); - const __m128i vy = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) y), vshift); - const __m128i vz = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) z), vshift); - const __m128i vw = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) w), vshift); - w += 8; - const __m128i vxy = _mm_unpacklo_epi8(vx, vy); - const __m128i vzw = _mm_unpacklo_epi8(vz, vw); - __m128i vxyzw0 = _mm_unpacklo_epi16(vxy, vzw); - __m128i vxyzw1 = _mm_unpackhi_epi16(vxy, vzw); - - if (k & 4) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = vxyzw1; - } - - if (k & 2) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2)); - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0); - } - if (k & 1) { - unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0)); - output = (uint8_t*) ((uintptr_t) output + m); - } - } - output = (uint8_t*) ((uintptr_t) output + output_increment); - if (output > last_output) { - output = last_output; - } - } - } else { - const uint8_t* i = input; - uint8_t* o = output; - size_t k = n; - do { - size_t l = m; - const uint8_t* ii = i++; - do { - *o++ = *ii; - ii += n; - } while (--l != 0); - } while (--k != 0); - } -} diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h index e5f819db0b0..7ee797f9c5e 100644 --- a/src/xnnpack/compute.h +++ b/src/xnnpack/compute.h @@ -1198,29 +1198,6 @@ struct elementwise_binary_context { size_t i, size_t j, size_t k, size_t l, size_t m); #endif -struct channel_shuffle_context { - const void* x; - size_t x_stride; - void* y; - size_t y_stride; - size_t n; - size_t m; - union { - xnn_zipc_ukernel_fn fixed_ukernel; - xnn_zipv_ukernel_fn variable_ukernel; - }; -}; - -#ifndef __cplusplus - XNN_PRIVATE void xnn_compute_channel_shuffle_fixed( - const struct channel_shuffle_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t index); - - XNN_PRIVATE void xnn_compute_channel_shuffle_variable( - const struct channel_shuffle_context context[restrict XNN_MIN_ELEMENTS(1)], - size_t index); -#endif - struct lut_strided_context { size_t n; const void* x; diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h index 3ff8c61441a..0fd3d1bf933 100644 --- a/src/xnnpack/config.h +++ b/src/xnnpack/config.h @@ -267,9 +267,6 @@ XNN_INTERNAL const struct xnn_maxpool_config* xnn_init_f32_maxpool_config(); XNN_INTERNAL const struct xnn_maxpool_config* xnn_init_s8_maxpool_config(); XNN_INTERNAL const struct xnn_maxpool_config* xnn_init_u8_maxpool_config(); -XNN_INTERNAL const struct xnn_zip_config* xnn_init_x8_zip_config(); -XNN_INTERNAL const struct xnn_zip_config* xnn_init_x32_zip_config(); - XNN_INTERNAL const struct xnn_rmax_config* xnn_init_f16_rmax_config(); XNN_INTERNAL const struct xnn_rmax_config* xnn_init_f32_rmax_config(); XNN_INTERNAL const struct xnn_rmax_config* xnn_init_u8_rmax_config(); diff --git a/src/xnnpack/operator-type-defs.h b/src/xnnpack/operator-type-defs.h index 623ef6da653..96aecb7bfc1 100644 --- a/src/xnnpack/operator-type-defs.h +++ b/src/xnnpack/operator-type-defs.h @@ -18,8 +18,6 @@ XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_f32, "Batch Matrix Mult XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_qd8_f32_qc8w, "Batch Matrix Multiply (NC, QD8, F32, QC8W)") XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_qdu8_f32_qc8w, "Batch Matrix Multiply (NC, QDU8, F32, QC8W)") XNN_ENUM_ITEM(xnn_operator_type_binary_elementwise, "Binary Elementwise (ND)") -XNN_ENUM_ITEM(xnn_operator_type_channel_shuffle_nc_x8, "Channel Shuffle (NC, X8)") -XNN_ENUM_ITEM(xnn_operator_type_channel_shuffle_nc_x32, "Channel Shuffle (NC, X32)") XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x8, "Constant Pad (ND, X8)") XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x16, "Constant Pad (ND, X16)") XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x32, "Constant Pad (ND, X32)") diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h index 6d88baa4919..c43a5107a9e 100644 --- a/src/xnnpack/operator.h +++ b/src/xnnpack/operator.h @@ -331,7 +331,6 @@ struct xnn_operator { union { struct argmax_pooling_context argmax_pooling; struct average_pooling_context average_pooling; - struct channel_shuffle_context channel_shuffle; struct conv2d_context conv2d; struct dwconv2d_context dwconv2d; struct { diff --git a/test/BUILD.bazel b/test/BUILD.bazel index d67eaa6dc01..c1f21d2fa8b 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -1084,15 +1084,6 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS, ) -xnnpack_unit_test( - name = "x8_zip_test", - srcs = [ - "x8-zip.cc", - "zip-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "x32_packb_test", srcs = [ @@ -1151,15 +1142,6 @@ xnnpack_unit_test( deps = MICROKERNEL_TEST_DEPS, ) -xnnpack_unit_test( - name = "x32_zip_test", - srcs = [ - "x32-zip.cc", - "zip-microkernel-tester.h", - ], - deps = MICROKERNEL_TEST_DEPS, -) - xnnpack_unit_test( name = "xx_fill_test", srcs = ["xx-fill.cc"], @@ -1237,15 +1219,6 @@ xnnpack_unit_test( deps = OPERATOR_TEST_DEPS, ) -xnnpack_unit_test( - name = "channel_shuffle_nc_test", - srcs = [ - "channel-shuffle-nc.cc", - "channel-shuffle-operator-tester.h", - ], - deps = OPERATOR_TEST_DEPS, -) - xnnpack_unit_test( name = "constant_pad_nd_test", srcs = [ diff --git a/test/channel-shuffle-nc.cc b/test/channel-shuffle-nc.cc deleted file mode 100644 index b84e9f993d4..00000000000 --- a/test/channel-shuffle-nc.cc +++ /dev/null @@ -1,504 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#include - -#include -#include "channel-shuffle-operator-tester.h" - -TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_unit_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(1) - .groups(2) - .group_channels(group_channels) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_unit_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(1) - .groups(3) - .group_channels(group_channels) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_unit_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(1) - .groups(4) - .group_channels(group_channels) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_unit_batch) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(1) - .groups(groups) - .group_channels(group_channels) - .iterations(3) - .TestX8(); - } - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_small_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(2) - .group_channels(group_channels) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_small_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(3) - .group_channels(group_channels) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_small_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(4) - .group_channels(group_channels) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_small_batch) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(groups) - .group_channels(group_channels) - .iterations(3) - .TestX8(); - } - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_small_batch_with_input_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(2) - .group_channels(group_channels) - .input_stride(511) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_small_batch_with_input_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(3) - .group_channels(group_channels) - .input_stride(511) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_small_batch_with_input_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(4) - .group_channels(group_channels) - .input_stride(511) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_small_batch_with_input_stride) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(groups) - .group_channels(group_channels) - .input_stride(1007) - .iterations(3) - .TestX8(); - } - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_small_batch_with_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(2) - .group_channels(group_channels) - .output_stride(513) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_small_batch_with_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(3) - .group_channels(group_channels) - .output_stride(513) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_small_batch_with_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(4) - .group_channels(group_channels) - .output_stride(513) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_small_batch_with_output_stride) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(groups) - .group_channels(group_channels) - .output_stride(1111) - .iterations(3) - .TestX8(); - } - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_small_batch_with_input_and_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(2) - .group_channels(group_channels) - .input_stride(511) - .output_stride(513) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_small_batch_with_input_and_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(3) - .group_channels(group_channels) - .input_stride(511) - .output_stride(513) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_small_batch_with_input_and_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(4) - .group_channels(group_channels) - .input_stride(511) - .output_stride(513) - .iterations(3) - .TestX8(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_small_batch_with_input_and_output_stride) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(groups) - .group_channels(group_channels) - .input_stride(1007) - .output_stride(1111) - .iterations(3) - .TestX8(); - } - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_unit_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(1) - .groups(2) - .group_channels(group_channels) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_unit_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(1) - .groups(3) - .group_channels(group_channels) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_unit_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(1) - .groups(4) - .group_channels(group_channels) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_unit_batch) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(1) - .groups(groups) - .group_channels(group_channels) - .iterations(3) - .TestX32(); - } - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_small_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(2) - .group_channels(group_channels) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_small_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(3) - .group_channels(group_channels) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_small_batch) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(4) - .group_channels(group_channels) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_small_batch) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(groups) - .group_channels(group_channels) - .iterations(3) - .TestX32(); - } - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_small_batch_with_input_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(2) - .group_channels(group_channels) - .input_stride(511) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_small_batch_with_input_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(3) - .group_channels(group_channels) - .input_stride(511) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_small_batch_with_input_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(4) - .group_channels(group_channels) - .input_stride(511) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_small_batch_with_input_stride) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(groups) - .group_channels(group_channels) - .input_stride(1007) - .iterations(3) - .TestX32(); - } - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_small_batch_with_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(2) - .group_channels(group_channels) - .output_stride(513) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_small_batch_with_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(3) - .group_channels(group_channels) - .output_stride(513) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_small_batch_with_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(4) - .group_channels(group_channels) - .output_stride(513) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_small_batch_with_output_stride) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(groups) - .group_channels(group_channels) - .output_stride(1111) - .iterations(3) - .TestX32(); - } - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_small_batch_with_input_and_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(2) - .group_channels(group_channels) - .input_stride(511) - .output_stride(513) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_small_batch_with_input_and_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(3) - .group_channels(group_channels) - .input_stride(511) - .output_stride(513) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_small_batch_with_input_and_output_stride) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(4) - .group_channels(group_channels) - .input_stride(511) - .output_stride(513) - .iterations(3) - .TestX32(); - } -} - -TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_small_batch_with_input_and_output_stride) { - for (size_t groups = 5; groups < 12; groups += 3) { - for (size_t group_channels = 1; group_channels < 100; group_channels += 15) { - ChannelShuffleOperatorTester() - .batch_size(3) - .groups(groups) - .group_channels(group_channels) - .input_stride(1007) - .output_stride(1111) - .iterations(3) - .TestX32(); - } - } -} diff --git a/test/channel-shuffle-operator-tester.h b/test/channel-shuffle-operator-tester.h deleted file mode 100644 index e3ac7117df8..00000000000 --- a/test/channel-shuffle-operator-tester.h +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. -// All rights reserved. -// -// Copyright 2019 Google LLC -// -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "xnnpack.h" -#include "xnnpack/buffer.h" -#include "replicable_random_device.h" - -class ChannelShuffleOperatorTester { - public: - ChannelShuffleOperatorTester& groups(size_t groups) { - assert(groups != 0); - this->groups_ = groups; - return *this; - } - - size_t groups() const { - return this->groups_; - } - - ChannelShuffleOperatorTester& group_channels(size_t group_channels) { - assert(group_channels != 0); - this->group_channels_ = group_channels; - return *this; - } - - size_t group_channels() const { - return this->group_channels_; - } - - size_t channels() const { - return groups() * group_channels(); - } - - ChannelShuffleOperatorTester& input_stride(size_t input_stride) { - assert(input_stride != 0); - this->input_stride_ = input_stride; - return *this; - } - - size_t input_stride() const { - if (this->input_stride_ == 0) { - return channels(); - } else { - assert(this->input_stride_ >= channels()); - return this->input_stride_; - } - } - - ChannelShuffleOperatorTester& output_stride(size_t output_stride) { - assert(output_stride != 0); - this->output_stride_ = output_stride; - return *this; - } - - size_t output_stride() const { - if (this->output_stride_ == 0) { - return channels(); - } else { - assert(this->output_stride_ >= channels()); - return this->output_stride_; - } - } - - ChannelShuffleOperatorTester& batch_size(size_t batch_size) { - assert(batch_size != 0); - this->batch_size_ = batch_size; - return *this; - } - - size_t batch_size() const { - return this->batch_size_; - } - - ChannelShuffleOperatorTester& iterations(size_t iterations) { - this->iterations_ = iterations; - return *this; - } - - size_t iterations() const { - return this->iterations_; - } - - void TestX8() const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u8dist( - std::numeric_limits::min(), std::numeric_limits::max()); - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(uint8_t) + (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); }); - - // Create, setup, run, and destroy Channel Shuffle operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t channel_shuffle_op = nullptr; - - ASSERT_EQ(xnn_status_success, - xnn_create_channel_shuffle_nc_x8( - groups(), group_channels(), - input_stride(), output_stride(), - 0, &channel_shuffle_op)); - ASSERT_NE(nullptr, channel_shuffle_op); - - // Smart pointer to automatically delete channel_shuffle_op. - std::unique_ptr auto_channel_shuffle_op(channel_shuffle_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, - xnn_reshape_channel_shuffle_nc_x8( - channel_shuffle_op, - batch_size(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, - xnn_setup_channel_shuffle_nc_x8( - channel_shuffle_op, - input.data(), output.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(channel_shuffle_op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t g = 0; g < groups(); g++) { - for (size_t c = 0; c < group_channels(); c++) { - ASSERT_EQ(int32_t(input[i * input_stride() + g * group_channels() + c]), - int32_t(output[i * output_stride() + c * groups() + g])) - << "batch index " << i << ", group " << g << ", channel " << c; - } - } - } - } - } - - void TestX32() const { - xnnpack::ReplicableRandomDevice rng; - std::uniform_int_distribution u32dist; - - xnnpack::Buffer input(XNN_EXTRA_BYTES / sizeof(uint32_t) + (batch_size() - 1) * input_stride() + channels()); - xnnpack::Buffer output((batch_size() - 1) * output_stride() + channels()); - for (size_t iteration = 0; iteration < iterations(); iteration++) { - std::generate(input.begin(), input.end(), [&]() { return u32dist(rng); }); - - // Create, setup, run, and destroy Channel Shuffle operator. - ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */)); - xnn_operator_t channel_shuffle_op = nullptr; - - ASSERT_EQ(xnn_status_success, - xnn_create_channel_shuffle_nc_x32( - groups(), group_channels(), - input_stride(), output_stride(), - 0, &channel_shuffle_op)); - ASSERT_NE(nullptr, channel_shuffle_op); - - // Smart pointer to automatically delete channel_shuffle_op. - std::unique_ptr auto_channel_shuffle_op(channel_shuffle_op, xnn_delete_operator); - - ASSERT_EQ(xnn_status_success, - xnn_reshape_channel_shuffle_nc_x32( - channel_shuffle_op, - batch_size(), - /*threadpool=*/nullptr)); - - ASSERT_EQ(xnn_status_success, - xnn_setup_channel_shuffle_nc_x32( - channel_shuffle_op, - input.data(), output.data())); - - ASSERT_EQ(xnn_status_success, - xnn_run_operator(channel_shuffle_op, /*threadpool=*/nullptr)); - - // Verify results. - for (size_t i = 0; i < batch_size(); i++) { - for (size_t g = 0; g < groups(); g++) { - for (size_t c = 0; c < group_channels(); c++) { - ASSERT_EQ(input[i * input_stride() + g * group_channels() + c], - output[i * output_stride() + c * groups() + g]) - << "batch index " << i << ", group " << g << ", channel " << c; - } - } - } - } - } - - private: - size_t groups_{1}; - size_t group_channels_{1}; - size_t batch_size_{1}; - size_t input_stride_{0}; - size_t output_stride_{0}; - size_t iterations_{15}; -};