diff --git a/CMakeLists.txt b/CMakeLists.txt
index c94020d7e74..1877c8dec45 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -430,7 +430,6 @@ SET(OPERATOR_SRCS
   src/operators/average-pooling-nhwc.c
   src/operators/batch-matrix-multiply-nc.c
   src/operators/binary-elementwise-nd.c
-  src/operators/channel-shuffle-nc.c
   src/operators/constant-pad-nd.c
   src/operators/convolution-nchw.c
   src/operators/convolution-nhwc.c
@@ -523,7 +522,6 @@ SET(XNNPACK_SRCS
   src/configs/xx-fill-config.c
   src/configs/xx-pad-config.c
   src/configs/x8-lut-config.c
-  src/configs/zip-config.c
   src/init.c
   src/params.c
   "${PROJECT_BINARY_DIR}/build_identifier.c")
@@ -1472,12 +1470,10 @@ IF(XNNPACK_BUILD_TESTS)
       x32-packw
       x32-packx
       x32-unpool
-      x32-zip
       x8-lut
       x8-packw
       qs8-packw
       qs8-qc4w-packw
-      x8-zip
       xN-transpose
       xx-fill
       xx-pad)
@@ -1875,7 +1871,6 @@ IF(XNNPACK_BUILD_BENCHMARKS)
     # ---[ Build operator-level microbenchmarks
     SET(LIBRARY_OPERATOR_BENCHMARKS
         average-pooling
-        channel-shuffle
         convolution
         deconvolution
         max-pooling
diff --git a/bench/BUILD.bazel b/bench/BUILD.bazel
index 16eacfc836b..93380b3a908 100644
--- a/bench/BUILD.bazel
+++ b/bench/BUILD.bazel
@@ -608,12 +608,6 @@ xnnpack_benchmark(
     ],
 )
 
-xnnpack_benchmark(
-    name = "channel_shuffle_bench",
-    srcs = ["channel-shuffle.cc"],
-    deps = OPERATOR_BENCHMARK_DEPS,
-)
-
 xnnpack_benchmark(
     name = "convolution_bench",
     srcs = ["convolution.cc"],
diff --git a/bench/channel-shuffle.cc b/bench/channel-shuffle.cc
deleted file mode 100644
index 0a9f820ba3a..00000000000
--- a/bench/channel-shuffle.cc
+++ /dev/null
@@ -1,340 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <algorithm>
-#include <cmath>
-#include <functional>
-#include <limits>
-#include <random>
-#include <vector>
-
-#include "xnnpack.h"
-
-#include <benchmark/benchmark.h>
-#include "utils.h"
-#include "xnnpack/buffer.h"
-
-
-static void channel_shuffle_x8(benchmark::State& state, const char* net) {
-  const size_t batch_size = static_cast<size_t>(state.range(0));
-  const size_t groups = static_cast<size_t>(state.range(1));
-  const size_t group_channels = static_cast<size_t>(state.range(2));
-
-  std::random_device random_device;
-  auto rng = std::mt19937(random_device());
-
-  xnnpack::Buffer<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + batch_size * groups * group_channels);
-  xnnpack::Buffer<uint8_t> output(batch_size * groups * group_channels);
-  xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng);
-
-  xnn_status status = xnn_initialize(nullptr /* allocator */);
-  if (status != xnn_status_success) {
-    state.SkipWithError("failed to initialize XNNPACK");
-    return;
-  }
-
-  xnn_operator_t channel_shuffle_op = nullptr;
-  status = xnn_create_channel_shuffle_nc_x8(
-    groups, group_channels,
-    groups * group_channels /* input stride */,
-    groups * group_channels /* output stride */,
-    0 /* flags */, &channel_shuffle_op);
-  if (status != xnn_status_success || channel_shuffle_op == nullptr) {
-    state.SkipWithError("failed to create X8 Channel Shuffle operator");
-    return;
-  }
-
-  status = xnn_reshape_channel_shuffle_nc_x8(
-    channel_shuffle_op,
-    batch_size,
-    /*threadpool=*/nullptr);
-  if (status != xnn_status_success) {
-    state.SkipWithError("failed to reshape X8 Channel Shuffle operator");
-    return;
-  }
-
-  status = xnn_setup_channel_shuffle_nc_x8(
-    channel_shuffle_op,
-    input.data(), output.data());
-  if (status != xnn_status_success) {
-    state.SkipWithError("failed to setup X8 Channel Shuffle operator");
-    return;
-  }
-
-  for (auto _ : state) {
-    status = xnn_run_operator(channel_shuffle_op, /*threadpool=*/nullptr);
-    if (status != xnn_status_success) {
-      state.SkipWithError("failed to run X8 Channel Shuffle operator");
-      return;
-    }
-  }
-
-  status = xnn_delete_operator(channel_shuffle_op);
-  if (status != xnn_status_success) {
-    state.SkipWithError("failed to delete X8 Channel Shuffle operator");
-    return;
-  }
-
-  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
-  if (cpu_frequency != 0) {
-    state.counters["cpufreq"] = cpu_frequency;
-  }
-
-  const size_t elements_per_iteration = batch_size * groups * group_channels;
-  state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
-
-  const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(uint8_t);
-  state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
-}
-
-static void channel_shuffle_x32(benchmark::State& state, const char* net) {
-  const size_t batch_size = static_cast<size_t>(state.range(0));
-  const size_t groups = static_cast<size_t>(state.range(1));
-  const size_t group_channels = static_cast<size_t>(state.range(2));
-
-  std::random_device random_device;
-  auto rng = std::mt19937(random_device());
-  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
-
-  xnnpack::Buffer<float> input(XNN_EXTRA_BYTES / sizeof(float) + batch_size * groups * group_channels);
-  xnnpack::Buffer<float> output(batch_size * groups * group_channels);
-  std::generate(input.begin(), input.end(), std::ref(f32rng));
-
-  xnn_status status = xnn_initialize(nullptr /* allocator */);
-  if (status != xnn_status_success) {
-    state.SkipWithError("failed to initialize XNNPACK");
-    return;
-  }
-
-  xnn_operator_t channel_shuffle_op = nullptr;
-  status = xnn_create_channel_shuffle_nc_x32(
-    groups, group_channels,
-    groups * group_channels /* input stride */,
-    groups * group_channels /* output stride */,
-    0 /* flags */, &channel_shuffle_op);
-  if (status != xnn_status_success || channel_shuffle_op == nullptr) {
-    state.SkipWithError("failed to create X32 Channel Shuffle operator");
-    return;
-  }
-
-  status = xnn_reshape_channel_shuffle_nc_x32(
-    channel_shuffle_op,
-    batch_size,
-    /*threadpool=*/nullptr);
-  if (status != xnn_status_success) {
-    state.SkipWithError("failed to reshape X32 Channel Shuffle operator");
-    return;
-  }
-
-  status = xnn_setup_channel_shuffle_nc_x32(
-    channel_shuffle_op,
-    input.data(), output.data());
-  if (status != xnn_status_success) {
-    state.SkipWithError("failed to setup X32 Channel Shuffle operator");
-    return;
-  }
-
-  for (auto _ : state) {
-    status = xnn_run_operator(channel_shuffle_op, /*threadpool=*/nullptr);
-    if (status != xnn_status_success) {
-      state.SkipWithError("failed to run X32 Channel Shuffle operator");
-      return;
-    }
-  }
-
-  status = xnn_delete_operator(channel_shuffle_op);
-  if (status != xnn_status_success) {
-    state.SkipWithError("failed to delete X32 Channel Shuffle operator");
-    return;
-  }
-
-  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
-  if (cpu_frequency != 0) {
-    state.counters["cpufreq"] = cpu_frequency;
-  }
-
-  const size_t elements_per_iteration = batch_size * groups * group_channels;
-  state.counters["elements"] =
-    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
-
-  const size_t bytes_per_iteration = 2 * elements_per_iteration * sizeof(float);
-  state.counters["bytes"] =
-    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
-}
-
-static void ShuffleNetV1G2Arguments(benchmark::internal::Benchmark* b)
-{
-  b->ArgNames({"N", "G", "GC"});
-
-  /******** Stage 2 ********/
-  /*        H    W  G   CG */
-  b->Args({56 * 56, 2,  25});
-  b->Args({28 * 28, 2,  25});
-
-  /******** Stage 3 ********/
-  /*        H    W  G   CG */
-  b->Args({28 * 28, 2,  50});
-  b->Args({14 * 14, 2,  50});
-
-  /******** Stage 4 ********/
-  /*        H    W  G   CG */
-  b->Args({14 * 14, 2, 100});
-  b->Args({ 7 *  7, 2, 100});
-}
-
-static void ShuffleNetV1G3Arguments(benchmark::internal::Benchmark* b)
-{
-  b->ArgNames({"N", "G", "GC"});
-
-  /******** Stage 2 *******/
-  /*        H    W  G  CG */
-  b->Args({56 * 56, 3, 20});
-  b->Args({28 * 28, 3, 20});
-
-  /******** Stage 3 *******/
-  /*        H    W  G  CG */
-  b->Args({28 * 28, 3, 40});
-  b->Args({14 * 14, 3, 40});
-
-  /******** Stage 4 *******/
-  /*        H    W  G  CG */
-  b->Args({14 * 14, 3, 80});
-  b->Args({ 7 *  7, 3, 80});
-}
-
-static void ShuffleNetV1G4Arguments(benchmark::internal::Benchmark* b)
-{
-  b->ArgNames({"N", "G", "GC"});
-
-  /******** Stage 2 *******/
-  /*        H    W  G  CG */
-  b->Args({56 * 56, 4, 17});
-  b->Args({28 * 28, 4, 17});
-
-  /******** Stage 3 *******/
-  /*        H    W  G  CG */
-  b->Args({28 * 28, 4, 34});
-  b->Args({14 * 14, 4, 34});
-
-  /******** Stage 4 *******/
-  /*        H    W  G  CG */
-  b->Args({14 * 14, 4, 68});
-  b->Args({ 7 *  7, 4, 68});
-}
-
-static void ShuffleNetV1G8Arguments(benchmark::internal::Benchmark* b)
-{
-  b->ArgNames({"N", "G", "GC"});
-
-  /******** Stage 2 *******/
-  /*        H    W  G  CG */
-  b->Args({56 * 56, 8, 12});
-  b->Args({28 * 28, 8, 12});
-
-  /******** Stage 3 *******/
-  /*        H    W  G  CG */
-  b->Args({28 * 28, 8, 24});
-  b->Args({14 * 14, 8, 24});
-
-  /******** Stage 4 *******/
-  /*        H    W  G  CG */
-  b->Args({14 * 14, 8, 48});
-  b->Args({ 7 *  7, 8, 48});
-}
-
-static void ShuffleNetV2x0_5Arguments(benchmark::internal::Benchmark* b)
-{
-  b->ArgNames({"N", "G", "GC"});
-
-  /******** Stage 2 *******/
-  /*        H    W  G  CG */
-  b->Args({28 * 28, 2, 24});
-
-  /******** Stage 3 *******/
-  /*        H    W  G  CG */
-  b->Args({14 * 14, 2, 48});
-
-  /******** Stage 4 *******/
-  /*        H    W  G  CG */
-  b->Args({ 7 *  7, 2, 96});
-}
-
-static void ShuffleNetV2x1_0Arguments(benchmark::internal::Benchmark* b)
-{
-  b->ArgNames({"N", "G", "GC"});
-
-  /******** Stage 2 ********/
-  /*        H    W  G   CG */
-  b->Args({28 * 28, 2,  58});
-
-  /******** Stage 3 ********/
-  /*        H    W  G   CG */
-  b->Args({14 * 14, 2, 116});
-
-  /******** Stage 4 ********/
-  /*        H    W  G   CG */
-  b->Args({ 7 *  7, 2, 232});
-}
-
-static void ShuffleNetV2x1_5Arguments(benchmark::internal::Benchmark* b)
-{
-  b->ArgNames({"N", "G", "GC"});
-
-  /******** Stage 2 ********/
-  /*        H    W  G   CG */
-  b->Args({28 * 28, 2,  88});
-
-  /******** Stage 3 ********/
-  /*        H    W  G   CG */
-  b->Args({14 * 14, 2, 176});
-
-  /******** Stage 4 ********/
-  /*        H    W  G   CG */
-  b->Args({ 7 *  7, 2, 352});
-}
-
-static void ShuffleNetV2x2_0Arguments(benchmark::internal::Benchmark* b)
-{
-  b->ArgNames({"N", "G", "GC"});
-
-  /******** Stage 2 ********/
-  /*        H    W  G   CG */
-  b->Args({28 * 28, 2, 122});
-
-  /******** Stage 3 ********/
-  /*        H    W  G   CG */
-  b->Args({14 * 14, 2, 244});
-
-  /******** Stage 4 ********/
-  /*        H    W  G   CG */
-  b->Args({ 7 *  7, 2, 488});
-}
-
-BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x8, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime();
-
-BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x05, "ShuffleNet v2 x0.5")->Apply(ShuffleNetV2x0_5Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x10, "ShuffleNet v2 x1.0")->Apply(ShuffleNetV2x1_0Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x15, "ShuffleNet v2 x1.5")->Apply(ShuffleNetV2x1_5Arguments)->UseRealTime();
-BENCHMARK_CAPTURE(channel_shuffle_x32, shufflenet_v2_x20, "ShuffleNet v2 x2.0")->Apply(ShuffleNetV2x2_0Arguments)->UseRealTime();
-
-#ifndef XNNPACK_BENCHMARK_NO_MAIN
-BENCHMARK_MAIN();
-#endif
diff --git a/build_srcs.bzl b/build_srcs.bzl
index bd75d4b55b4..e4c38f759c9 100644
--- a/build_srcs.bzl
+++ b/build_srcs.bzl
@@ -13,7 +13,6 @@ OPERATOR_SRCS = [
     "src/operators/average-pooling-nhwc.c",
     "src/operators/batch-matrix-multiply-nc.c",
     "src/operators/binary-elementwise-nd.c",
-    "src/operators/channel-shuffle-nc.c",
     "src/operators/constant-pad-nd.c",
     "src/operators/convolution-nchw.c",
     "src/operators/convolution-nhwc.c",
@@ -108,7 +107,6 @@ XNNPACK_SRCS = [
     "src/configs/x8-lut-config.c",
     "src/configs/xx-fill-config.c",
     "src/configs/xx-pad-config.c",
-    "src/configs/zip-config.c",
 ]
 
 LOGGING_SRCS = [
diff --git a/cmake/gen/neon_microkernels.cmake b/cmake/gen/neon_microkernels.cmake
index ec7b7a3ade6..6e714381fcb 100644
--- a/cmake/gen/neon_microkernels.cmake
+++ b/cmake/gen/neon_microkernels.cmake
@@ -151,10 +151,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS
   src/u8-rmax/u8-rmax-neon-u16.c
   src/u8-vclamp/u8-vclamp-neon-u64.c
   src/x8-transposec/gen/x8-transposec-16x16-reuse-dec-zip-neon.c
-  src/x8-zip/x8-zip-x2-neon.c
-  src/x8-zip/x8-zip-x3-neon.c
-  src/x8-zip/x8-zip-x4-neon.c
-  src/x8-zip/x8-zip-xm-neon.c
   src/x16-packw/gen/x16-packw-x8-gemm-goi-neon-ld4lane-u8-prfm.c
   src/x16-packw/gen/x16-packw-x16-gemm-goi-neon-ld4lane-u8-prfm.c
   src/x16-transposec/gen/x16-transposec-8x8-reuse-dec-zip-neon.c
@@ -164,10 +160,6 @@ SET(PROD_NEON_MICROKERNEL_SRCS
   src/x32-packw/gen/x32-packw-x8s4-gemm-goi-neon-ld4lane-u4-prfm.c
   src/x32-transposec/gen/x32-transposec-4x4-reuse-dec-zip-neon.c
   src/x32-unpool/x32-unpool-neon.c
-  src/x32-zip/x32-zip-x2-neon.c
-  src/x32-zip/x32-zip-x3-neon.c
-  src/x32-zip/x32-zip-x4-neon.c
-  src/x32-zip/x32-zip-xm-neon.c
   src/x64-transposec/gen/x64-transposec-2x2-multi-dec-zip-neon.c
   src/x64-transposec/gen/x64-transposec-2x2-reuse-dec-zip-neon.c
   src/xx-fill/xx-fill-neon-u64.c
diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake
index 89b5b0fdc0c..a394b9f01ed 100644
--- a/cmake/gen/scalar_microkernels.cmake
+++ b/cmake/gen/scalar_microkernels.cmake
@@ -231,10 +231,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS
   src/x8-packw/gen/x8-packw-x16-gemm-goi-scalar-u2.c
   src/x8-packw/gen/x8-packw-x32-gemm-goi-scalar-u2.c
   src/x8-transposec/gen/x8-transposec-2x4-scalar-int.c
-  src/x8-zip/x8-zip-x2-scalar.c
-  src/x8-zip/x8-zip-x3-scalar.c
-  src/x8-zip/x8-zip-x4-scalar.c
-  src/x8-zip/x8-zip-xm-scalar.c
   src/x16-packw/gen/x16-packw-x64-gemm-goi-scalar-int-u4.c
   src/x16-transposec/gen/x16-transposec-2x4-scalar-int.c
   src/x24-transposec/gen/x24-transposec-1x2-scalar.c
@@ -242,10 +238,6 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS
   src/x32-packw/gen/x32-packw-x4-gemm-goi-scalar-float-u4.c
   src/x32-transposec/gen/x32-transposec-2x4-scalar-int.c
   src/x32-unpool/x32-unpool-scalar.c
-  src/x32-zip/x32-zip-x2-scalar.c
-  src/x32-zip/x32-zip-x3-scalar.c
-  src/x32-zip/x32-zip-x4-scalar.c
-  src/x32-zip/x32-zip-xm-scalar.c
   src/x64-transposec/gen/x64-transposec-4x2-scalar-int.c
   src/xx-copy/xx-copy-scalar-memcpy.c
   src/xx-fill/xx-fill-scalar-u16.c
diff --git a/cmake/gen/sse2_microkernels.cmake b/cmake/gen/sse2_microkernels.cmake
index a6d675da34a..0615ceeec0c 100644
--- a/cmake/gen/sse2_microkernels.cmake
+++ b/cmake/gen/sse2_microkernels.cmake
@@ -86,18 +86,10 @@ SET(PROD_SSE2_MICROKERNEL_SRCS
   src/u8-rmax/u8-rmax-sse2-u16.c
   src/u8-vclamp/u8-vclamp-sse2-u64.c
   src/x8-transposec/gen/x8-transposec-16x16-reuse-mov-sse2.c
-  src/x8-zip/x8-zip-x2-sse2.c
-  src/x8-zip/x8-zip-x3-sse2.c
-  src/x8-zip/x8-zip-x4-sse2.c
-  src/x8-zip/x8-zip-xm-sse2.c
   src/x16-transposec/gen/x16-transposec-8x8-reuse-multi-sse2.c
   src/x32-packw/gen/x32-packw-x2c4-gemm-goi-sse2-u4.c
   src/x32-packw/gen/x32-packw-x8-gemm-goi-sse2-u4.c
   src/x32-unpool/x32-unpool-sse2.c
-  src/x32-zip/x32-zip-x2-sse2.c
-  src/x32-zip/x32-zip-x3-sse2.c
-  src/x32-zip/x32-zip-x4-sse2.c
-  src/x32-zip/x32-zip-xm-sse2.c
   src/x64-transposec/gen/x64-transposec-2x2-multi-mov-sse2.c
   src/xx-fill/xx-fill-sse2-u64.c
   src/xx-pad/xx-pad-p16-sse2-u16.c)
diff --git a/cmake/gen/wasmsimd_microkernels.cmake b/cmake/gen/wasmsimd_microkernels.cmake
index 19100e1bbb4..2d93b7d9717 100644
--- a/cmake/gen/wasmsimd_microkernels.cmake
+++ b/cmake/gen/wasmsimd_microkernels.cmake
@@ -216,10 +216,6 @@ SET(PROD_WASMSIMD_MICROKERNEL_SRCS
   src/x32-packw/gen/x32-packw-x8-gemm-goi-wasmsimd-u4.c
   src/x32-transposec/gen/x32-transposec-4x4-reuse-mov-wasmsimd.c
   src/x32-unpool/x32-unpool-wasmsimd.c
-  src/x32-zip/x32-zip-x2-wasmsimd.c
-  src/x32-zip/x32-zip-x3-wasmsimd.c
-  src/x32-zip/x32-zip-x4-wasmsimd.c
-  src/x32-zip/x32-zip-xm-wasmsimd.c
   src/xx-fill/xx-fill-wasmsimd-u64.c
   src/xx-pad/xx-pad-p16-wasmsimd-u16.c)
 
diff --git a/gen/neon_microkernels.bzl b/gen/neon_microkernels.bzl
index ac25b28f04c..9255594ed8e 100644
--- a/gen/neon_microkernels.bzl
+++ b/gen/neon_microkernels.bzl
@@ -147,10 +147,6 @@ PROD_NEON_MICROKERNEL_SRCS = [
     "src/u8-rmax/u8-rmax-neon-u16.c",
     "src/u8-vclamp/u8-vclamp-neon-u64.c",
     "src/x8-transposec/gen/x8-transposec-16x16-reuse-dec-zip-neon.c",
-    "src/x8-zip/x8-zip-x2-neon.c",
-    "src/x8-zip/x8-zip-x3-neon.c",
-    "src/x8-zip/x8-zip-x4-neon.c",
-    "src/x8-zip/x8-zip-xm-neon.c",
     "src/x16-packw/gen/x16-packw-x8-gemm-goi-neon-ld4lane-u8-prfm.c",
     "src/x16-packw/gen/x16-packw-x16-gemm-goi-neon-ld4lane-u8-prfm.c",
     "src/x16-transposec/gen/x16-transposec-8x8-reuse-dec-zip-neon.c",
@@ -160,10 +156,6 @@ PROD_NEON_MICROKERNEL_SRCS = [
     "src/x32-packw/gen/x32-packw-x8s4-gemm-goi-neon-ld4lane-u4-prfm.c",
     "src/x32-transposec/gen/x32-transposec-4x4-reuse-dec-zip-neon.c",
     "src/x32-unpool/x32-unpool-neon.c",
-    "src/x32-zip/x32-zip-x2-neon.c",
-    "src/x32-zip/x32-zip-x3-neon.c",
-    "src/x32-zip/x32-zip-x4-neon.c",
-    "src/x32-zip/x32-zip-xm-neon.c",
     "src/x64-transposec/gen/x64-transposec-2x2-multi-dec-zip-neon.c",
     "src/x64-transposec/gen/x64-transposec-2x2-reuse-dec-zip-neon.c",
     "src/xx-fill/xx-fill-neon-u64.c",
diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl
index c8cce31c401..2b65594d1c4 100644
--- a/gen/scalar_microkernels.bzl
+++ b/gen/scalar_microkernels.bzl
@@ -227,10 +227,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [
     "src/x8-packw/gen/x8-packw-x16-gemm-goi-scalar-u2.c",
     "src/x8-packw/gen/x8-packw-x32-gemm-goi-scalar-u2.c",
     "src/x8-transposec/gen/x8-transposec-2x4-scalar-int.c",
-    "src/x8-zip/x8-zip-x2-scalar.c",
-    "src/x8-zip/x8-zip-x3-scalar.c",
-    "src/x8-zip/x8-zip-x4-scalar.c",
-    "src/x8-zip/x8-zip-xm-scalar.c",
     "src/x16-packw/gen/x16-packw-x64-gemm-goi-scalar-int-u4.c",
     "src/x16-transposec/gen/x16-transposec-2x4-scalar-int.c",
     "src/x24-transposec/gen/x24-transposec-1x2-scalar.c",
@@ -238,10 +234,6 @@ PROD_SCALAR_MICROKERNEL_SRCS = [
     "src/x32-packw/gen/x32-packw-x4-gemm-goi-scalar-float-u4.c",
     "src/x32-transposec/gen/x32-transposec-2x4-scalar-int.c",
     "src/x32-unpool/x32-unpool-scalar.c",
-    "src/x32-zip/x32-zip-x2-scalar.c",
-    "src/x32-zip/x32-zip-x3-scalar.c",
-    "src/x32-zip/x32-zip-x4-scalar.c",
-    "src/x32-zip/x32-zip-xm-scalar.c",
     "src/x64-transposec/gen/x64-transposec-4x2-scalar-int.c",
     "src/xx-copy/xx-copy-scalar-memcpy.c",
     "src/xx-fill/xx-fill-scalar-u16.c",
diff --git a/gen/sse2_microkernels.bzl b/gen/sse2_microkernels.bzl
index 8375c7af656..f5d1f36709b 100644
--- a/gen/sse2_microkernels.bzl
+++ b/gen/sse2_microkernels.bzl
@@ -82,18 +82,10 @@ PROD_SSE2_MICROKERNEL_SRCS = [
     "src/u8-rmax/u8-rmax-sse2-u16.c",
     "src/u8-vclamp/u8-vclamp-sse2-u64.c",
     "src/x8-transposec/gen/x8-transposec-16x16-reuse-mov-sse2.c",
-    "src/x8-zip/x8-zip-x2-sse2.c",
-    "src/x8-zip/x8-zip-x3-sse2.c",
-    "src/x8-zip/x8-zip-x4-sse2.c",
-    "src/x8-zip/x8-zip-xm-sse2.c",
     "src/x16-transposec/gen/x16-transposec-8x8-reuse-multi-sse2.c",
     "src/x32-packw/gen/x32-packw-x2c4-gemm-goi-sse2-u4.c",
     "src/x32-packw/gen/x32-packw-x8-gemm-goi-sse2-u4.c",
     "src/x32-unpool/x32-unpool-sse2.c",
-    "src/x32-zip/x32-zip-x2-sse2.c",
-    "src/x32-zip/x32-zip-x3-sse2.c",
-    "src/x32-zip/x32-zip-x4-sse2.c",
-    "src/x32-zip/x32-zip-xm-sse2.c",
     "src/x64-transposec/gen/x64-transposec-2x2-multi-mov-sse2.c",
     "src/xx-fill/xx-fill-sse2-u64.c",
     "src/xx-pad/xx-pad-p16-sse2-u16.c",
diff --git a/gen/wasmsimd_microkernels.bzl b/gen/wasmsimd_microkernels.bzl
index a7fefd4e202..2925413d9d6 100644
--- a/gen/wasmsimd_microkernels.bzl
+++ b/gen/wasmsimd_microkernels.bzl
@@ -212,10 +212,6 @@ PROD_WASMSIMD_MICROKERNEL_SRCS = [
     "src/x32-packw/gen/x32-packw-x8-gemm-goi-wasmsimd-u4.c",
     "src/x32-transposec/gen/x32-transposec-4x4-reuse-mov-wasmsimd.c",
     "src/x32-unpool/x32-unpool-wasmsimd.c",
-    "src/x32-zip/x32-zip-x2-wasmsimd.c",
-    "src/x32-zip/x32-zip-x3-wasmsimd.c",
-    "src/x32-zip/x32-zip-x4-wasmsimd.c",
-    "src/x32-zip/x32-zip-xm-wasmsimd.c",
     "src/xx-fill/xx-fill-wasmsimd-u64.c",
     "src/xx-pad/xx-pad-p16-wasmsimd-u16.c",
 ]
diff --git a/include/xnnpack.h b/include/xnnpack.h
index ad38d79a34a..47ebbfc474e 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -2680,42 +2680,6 @@ enum xnn_status xnn_setup_batch_matrix_multiply_nc_qd8_f32_qc8w(
     const struct xnn_quantization_params* quantization_params,
     float* output);
 
-enum xnn_status xnn_create_channel_shuffle_nc_x8(
-  size_t groups,
-  size_t group_channels,
-  size_t input_stride,
-  size_t output_stride,
-  uint32_t flags,
-  xnn_operator_t* channel_shuffle_op_out);
-
-enum xnn_status xnn_reshape_channel_shuffle_nc_x8(
-  xnn_operator_t channel_shuffle_op,
-  size_t batch_size,
-  pthreadpool_t threadpool);
-
-enum xnn_status xnn_setup_channel_shuffle_nc_x8(
-  xnn_operator_t channel_shuffle_op,
-  const void* input,
-  void* output);
-
-enum xnn_status xnn_create_channel_shuffle_nc_x32(
-  size_t groups,
-  size_t group_channels,
-  size_t input_stride,
-  size_t output_stride,
-  uint32_t flags,
-  xnn_operator_t* channel_shuffle_op_out);
-
-enum xnn_status xnn_reshape_channel_shuffle_nc_x32(
-  xnn_operator_t channel_shuffle_op,
-  size_t batch_size,
-  pthreadpool_t threadpool);
-
-enum xnn_status xnn_setup_channel_shuffle_nc_x32(
-  xnn_operator_t channel_shuffle_op,
-  const void* input,
-  void* output);
-
 enum xnn_status xnn_create_constant_pad_nd_x8(
   const void* padding_value,
   uint32_t flags,
diff --git a/src/configs/zip-config.c b/src/configs/zip-config.c
deleted file mode 100644
index 4a5d3044e6a..00000000000
--- a/src/configs/zip-config.c
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2023 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include <stddef.h>
-
-#include "xnnpack/common.h"
-#include "xnnpack/config.h"
-#include "xnnpack/init-once.h"
-#include "xnnpack/microfnptr.h"
-#include "xnnpack/zip.h"
-
-static struct xnn_zip_config x8_zip_config = {0};
-static struct xnn_zip_config x32_zip_config = {0};
-
-XNN_INIT_ONCE_GUARD(x8_zip);
-XNN_INIT_ONCE_GUARD(x32_zip);
-
-static void init_x8_zip_config(void) {
-  #if XNN_ARCH_ARM
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->use_arm_neon) {
-      x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__neon;
-      x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__neon;
-      x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__neon;
-      x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__neon;
-    } else if (!XNN_PLATFORM_MOBILE) {
-      x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar;
-      x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar;
-      x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar;
-      x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar;
-    }
-  #elif XNN_ARCH_ARM64
-    x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__neon;
-    x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__neon;
-    x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__neon;
-    x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__neon;
-  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
-    x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__sse2;
-    x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__sse2;
-    x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__sse2;
-    x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__sse2;
-  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-    x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar;
-    x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar;
-    x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar;
-    x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar;
-  #else
-    x8_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x2_ukernel__scalar;
-    x8_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x3_ukernel__scalar;
-    x8_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x8_zip_x4_ukernel__scalar;
-    x8_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x8_zip_xm_ukernel__scalar;
-  #endif
-
-}
-
-static void init_x32_zip_config(void) {
-  #if XNN_ARCH_ARM
-    const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-    assert(hardware_config != NULL);
-    if (hardware_config->use_arm_neon) {
-      x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__neon;
-      x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__neon;
-      x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__neon;
-      x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__neon;
-    } else if (!XNN_PLATFORM_MOBILE) {
-      x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__scalar;
-      x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__scalar;
-      x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__scalar;
-      x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__scalar;
-    }
-  #elif XNN_ARCH_ARM64
-    x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__neon;
-    x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__neon;
-    x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__neon;
-    x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__neon;
-  #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
-    x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__sse2;
-    x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__sse2;
-    x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__sse2;
-    x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__sse2;
-  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
-    x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__wasmsimd;
-    x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__wasmsimd;
-    x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__wasmsimd;
-    x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__wasmsimd;
-  #else
-    x32_zip_config.x2 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x2_ukernel__scalar;
-    x32_zip_config.x3 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x3_ukernel__scalar;
-    x32_zip_config.x4 = (xnn_zipc_ukernel_fn) xnn_x32_zip_x4_ukernel__scalar;
-    x32_zip_config.xm = (xnn_zipv_ukernel_fn) xnn_x32_zip_xm_ukernel__scalar;
-  #endif
-
-}
-
-const struct xnn_zip_config* xnn_init_x8_zip_config() {
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-  if (hardware_config == NULL) {
-    return NULL;
-  }
-  XNN_INIT_ONCE(x8_zip);
-  return &x8_zip_config;
-}
-
-const struct xnn_zip_config* xnn_init_x32_zip_config() {
-  const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config();
-  if (hardware_config == NULL) {
-    return NULL;
-  }
-  XNN_INIT_ONCE(x32_zip);
-  return &x32_zip_config;
-}
diff --git a/src/operator-run.c b/src/operator-run.c
index 4dc666279f4..30616f8576e 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -1958,26 +1958,6 @@ void xnn_compute_elementwise_binary_5d(
   context->ukernel(context->elements, a, b, y, &context->params);
 }
 
-void xnn_compute_channel_shuffle_fixed(
-    const struct channel_shuffle_context context[restrict XNN_MIN_ELEMENTS(1)],
-    size_t index)
-{
-  const void* x = (const void*) ((uintptr_t) context->x + index * context->x_stride);
-  void* y = (void*) ((uintptr_t) context->y + index * context->y_stride);
-
-  context->fixed_ukernel(context->n, x, y);
-}
-
-void xnn_compute_channel_shuffle_variable(
-    const struct channel_shuffle_context context[restrict XNN_MIN_ELEMENTS(1)],
-    size_t index)
-{
-  const void* x = (const void*) ((uintptr_t) context->x + index * context->x_stride);
-  void* y = (void*) ((uintptr_t) context->y + index * context->y_stride);
-
-  context->variable_ukernel(context->n, context->m, x, y);
-}
-
 void xnn_compute_lut_strided(
     const struct lut_strided_context context[restrict XNN_MIN_ELEMENTS(1)],
     size_t batch_index)
diff --git a/src/operators/channel-shuffle-nc.c b/src/operators/channel-shuffle-nc.c
deleted file mode 100644
index 78ab65d8d70..00000000000
--- a/src/operators/channel-shuffle-nc.c
+++ /dev/null
@@ -1,311 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include "xnnpack.h"
-#include "xnnpack/allocator.h"
-#include "xnnpack/common.h"
-#include "xnnpack/compute.h"
-#include "xnnpack/config-types.h"
-#include "xnnpack/config.h"
-#include "xnnpack/log.h"
-#include "xnnpack/operator-type.h"
-#include "xnnpack/operator.h"
-#include "xnnpack/params.h"
-#include "pthreadpool.h"
-
-static enum xnn_status create_channel_shuffle_nc(
-  size_t groups,
-  size_t group_channels,
-  size_t input_stride,
-  size_t output_stride,
-  uint32_t flags,
-  const struct xnn_zip_config* zip_config,
-  enum xnn_operator_type operator_type,
-  xnn_operator_t* channel_shuffle_op_out)
-{
-  xnn_operator_t channel_shuffle_op = NULL;
-  enum xnn_status status = xnn_status_uninitialized;
-
-  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
-    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(operator_type));
-    goto error;
-  }
-
-  status = xnn_status_invalid_parameter;
-
-  if (groups <= 1) {
-    xnn_log_error(
-      "failed to create %s operator with %zu groups: at least two groups required",
-      xnn_operator_type_to_string(operator_type), groups);
-    goto error;
-  }
-
-  if (group_channels == 0) {
-    xnn_log_error(
-      "failed to create %s operator with %zu group channels: number of group channels must be non-zero",
-      xnn_operator_type_to_string(operator_type), group_channels);
-    goto error;
-  }
-
-  const size_t channels = groups * group_channels;
-  if (input_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with input element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zux%zu)",
-      xnn_operator_type_to_string(operator_type), input_stride, groups, group_channels);
-    goto error;
-  }
-
-  if (output_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with output element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zux%zu)",
-      xnn_operator_type_to_string(operator_type), output_stride, groups, group_channels);
-    goto error;
-  }
-
-  status = xnn_status_out_of_memory;
-
-  channel_shuffle_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
-  if (channel_shuffle_op == NULL) {
-    xnn_log_error(
-      "failed to allocate %zu bytes for %s operator descriptor",
-      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
-    goto error;
-  }
-
-  channel_shuffle_op->groups = groups;
-  channel_shuffle_op->group_channels = group_channels;
-  channel_shuffle_op->input_pixel_stride = input_stride;
-  channel_shuffle_op->output_pixel_stride = output_stride;
-
-  channel_shuffle_op->type = operator_type;
-  channel_shuffle_op->flags = flags;
-  channel_shuffle_op->zip_config = zip_config;
-
-  channel_shuffle_op->state = xnn_run_state_invalid;
-
-  *channel_shuffle_op_out = channel_shuffle_op;
-  return xnn_status_success;
-
-error:
-  xnn_delete_operator(channel_shuffle_op);
-  return status;
-}
-
-
-enum xnn_status xnn_create_channel_shuffle_nc_x8(
-    size_t groups,
-    size_t group_channels,
-    size_t input_stride,
-    size_t output_stride,
-    uint32_t flags,
-    xnn_operator_t* channel_shuffle_op_out)
-{
-  const struct xnn_zip_config* zip_config = xnn_init_x8_zip_config();
-  assert(zip_config != NULL);
-  return create_channel_shuffle_nc(
-    groups,
-    group_channels,
-    input_stride,
-    output_stride,
-    flags,
-    zip_config,
-    xnn_operator_type_channel_shuffle_nc_x8,
-    channel_shuffle_op_out);
-}
-
-enum xnn_status xnn_create_channel_shuffle_nc_x32(
-    size_t groups,
-    size_t group_channels,
-    size_t input_stride,
-    size_t output_stride,
-    uint32_t flags,
-    xnn_operator_t* channel_shuffle_op_out)
-{
-  const struct xnn_zip_config* zip_config = xnn_init_x32_zip_config();
-  if (zip_config == NULL) {
-    xnn_log_error(
-      "failed to create %s operator: unsupported hardware configuration",
-      xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x32));
-    return xnn_status_unsupported_hardware;
-  }
-  return create_channel_shuffle_nc(
-    groups,
-    group_channels,
-    input_stride,
-    output_stride,
-    flags,
-    zip_config,
-    xnn_operator_type_channel_shuffle_nc_x32,
-    channel_shuffle_op_out);
-}
-
-static enum xnn_status reshape_channel_shuffle_nc(
-    xnn_operator_t channel_shuffle_op,
-    size_t batch_size,
-    uint32_t log2_element_size,
-    const struct xnn_zip_config zip[restrict XNN_MIN_ELEMENTS(1)])
-{
-  channel_shuffle_op->state = xnn_run_state_invalid;
-
-  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
-    xnn_log_error("failed to reshape %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(channel_shuffle_op->type));
-    return xnn_status_uninitialized;
-  }
-
-  if (batch_size == 0) {
-    channel_shuffle_op->state = xnn_run_state_skip;
-    return xnn_status_success;
-  }
-
-  channel_shuffle_op->batch_size = batch_size;
-
-  const size_t groups = channel_shuffle_op->groups;
-  channel_shuffle_op->context.channel_shuffle = (struct channel_shuffle_context) {
-    .x_stride = channel_shuffle_op->input_pixel_stride << log2_element_size,
-    .y_stride = channel_shuffle_op->output_pixel_stride << log2_element_size,
-    .n = channel_shuffle_op->group_channels << log2_element_size,
-    .m = groups,
-  };
-  channel_shuffle_op->compute[0].type = xnn_parallelization_type_1d;
-  channel_shuffle_op->compute[0].range[0] = batch_size;
-  switch (groups) {
-    case 2:
-      channel_shuffle_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_channel_shuffle_fixed;
-      channel_shuffle_op->context.channel_shuffle.fixed_ukernel = zip->x2;
-      break;
-    case 3:
-      channel_shuffle_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_channel_shuffle_fixed;
-      channel_shuffle_op->context.channel_shuffle.fixed_ukernel = zip->x3;
-      break;
-    case 4:
-      channel_shuffle_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_channel_shuffle_fixed;
-      channel_shuffle_op->context.channel_shuffle.fixed_ukernel = zip->x4;
-      break;
-    default:
-      channel_shuffle_op->compute[0].task_1d = (pthreadpool_task_1d_t) xnn_compute_channel_shuffle_variable;
-      channel_shuffle_op->context.channel_shuffle.variable_ukernel = zip->xm;
-      break;
-    case 0:
-    case 1:
-      XNN_UNREACHABLE;
-  }
-  channel_shuffle_op->state = xnn_run_state_needs_setup;
-
-  return xnn_status_success;
-}
-
-enum xnn_status xnn_reshape_channel_shuffle_nc_x8(
-    xnn_operator_t channel_shuffle_op,
-    size_t batch_size,
-    pthreadpool_t threadpool)
-{
-  if (channel_shuffle_op->type != xnn_operator_type_channel_shuffle_nc_x8) {
-    xnn_log_error("failed to reshape operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x8),
-      xnn_operator_type_to_string(channel_shuffle_op->type));
-    return xnn_status_invalid_parameter;
-  }
-
-  return reshape_channel_shuffle_nc(
-    channel_shuffle_op,
-    batch_size,
-    /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT8_T,
-    channel_shuffle_op->zip_config);
-}
-
-enum xnn_status xnn_reshape_channel_shuffle_nc_x32(
-    xnn_operator_t channel_shuffle_op,
-    size_t batch_size,
-    pthreadpool_t threadpool)
-{
-  if (channel_shuffle_op->type != xnn_operator_type_channel_shuffle_nc_x32) {
-    xnn_log_error("failed to reshape operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x32),
-      xnn_operator_type_to_string(channel_shuffle_op->type));
-    return xnn_status_invalid_parameter;
-  }
-
-  return reshape_channel_shuffle_nc(
-    channel_shuffle_op,
-    batch_size,
-    /*log2_element_size=*/XNN_LOG2_SIZEOF_UINT32_T,
-    channel_shuffle_op->zip_config);
-}
-
-static enum xnn_status setup_channel_shuffle_nc(
-    xnn_operator_t channel_shuffle_op,
-    const void* input,
-    void* output)
-{
-  switch (channel_shuffle_op->state) {
-    case xnn_run_state_skip:
-      return xnn_status_success;
-    case xnn_run_state_invalid:
-      xnn_log_error(
-        "failed to setup %s operator: operator has not been reshaped yet",
-        xnn_operator_type_to_string(channel_shuffle_op->type));
-      return xnn_status_invalid_state;
-    case xnn_run_state_needs_setup:
-      // Operator has been reshaped, but not setup, continue with setup.
-    case xnn_run_state_ready:
-      // Operator has been reshaped, and we are setting up with different pointers.
-      break;
-  }
-
-  channel_shuffle_op->context.channel_shuffle.x = input;
-  channel_shuffle_op->context.channel_shuffle.y = output;
-
-  channel_shuffle_op->state = xnn_run_state_ready;
-
-  return xnn_status_success;
-}
-
-enum xnn_status xnn_setup_channel_shuffle_nc_x8(
-    xnn_operator_t channel_shuffle_op,
-    const void* input,
-    void* output)
-{
-  if (channel_shuffle_op->type != xnn_operator_type_channel_shuffle_nc_x8) {
-    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x8),
-      xnn_operator_type_to_string(channel_shuffle_op->type));
-    return xnn_status_invalid_parameter;
-  }
-
-  return setup_channel_shuffle_nc(
-    channel_shuffle_op,
-    input,
-    output);
-}
-
-enum xnn_status xnn_setup_channel_shuffle_nc_x32(
-    xnn_operator_t channel_shuffle_op,
-    const void* input,
-    void* output)
-{
-  if (channel_shuffle_op->type != xnn_operator_type_channel_shuffle_nc_x32) {
-    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(xnn_operator_type_channel_shuffle_nc_x32),
-      xnn_operator_type_to_string(channel_shuffle_op->type));
-    return xnn_status_invalid_parameter;
-  }
-
-  return setup_channel_shuffle_nc(
-    channel_shuffle_op,
-    input,
-    output);
-}
diff --git a/src/x32-zip/x32-zip-x2-neon.c b/src/x32-zip/x32-zip-x2-neon.c
deleted file mode 100644
index d56f32fa046..00000000000
--- a/src/x32-zip/x32-zip-x2-neon.c
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x2_ukernel__neon(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-
-  const uint32_t* x = input;
-  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
-  uint32_t* o = output;
-
-  while (n >= 16) {
-    uint32x4x2_t vxy;
-    vxy.val[0] = vld1q_u32(x); x += 4;
-    vxy.val[1] = vld1q_u32(y); y += 4;
-    vst2q_u32(o, vxy); o += 8;
-    n -= 16;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    if (n & 8) {
-      uint32x2x2_t vxy;
-      vxy.val[0] = vld1_u32(x); x += 2;
-      vxy.val[1] = vld1_u32(y); y += 2;
-      vst2_u32(o, vxy); o += 4;
-    }
-    if (n & 4) {
-      uint32x2_t vxy = vld1_dup_u32(x);
-      vxy = vld1_lane_u32(y, vxy, 1);
-      vst1_u32(o, vxy);
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-x2-scalar.c b/src/x32-zip/x32-zip-x2-scalar.c
deleted file mode 100644
index f6e3c86b1f4..00000000000
--- a/src/x32-zip/x32-zip-x2-scalar.c
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x2_ukernel__scalar(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-
-  const uint32_t* x = input;
-  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
-
-  do {
-    const uint32_t vx = *x++;
-    const uint32_t vy = *y++;
-    output[0] = vx;
-    output[1] = vy;
-    output += 2;
-
-    n -= 4;
-  } while (n != 0);
-}
diff --git a/src/x32-zip/x32-zip-x2-sse2.c b/src/x32-zip/x32-zip-x2-sse2.c
deleted file mode 100644
index 548976c41ff..00000000000
--- a/src/x32-zip/x32-zip-x2-sse2.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x2_ukernel__sse2(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-
-  const uint32_t* x = input;
-  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
-  uint32_t* o = output;
-
-  while (n >= 16) {
-    const __m128i vx = _mm_loadu_si128((const __m128i*) x);
-    x += 4;
-    const __m128i vy = _mm_loadu_si128((const __m128i*) y);
-    y += 4;
-    const __m128i vxy_lo = _mm_unpacklo_epi32(vx, vy);
-    const __m128i vxy_hi = _mm_unpackhi_epi32(vx, vy);
-    _mm_storeu_si128((__m128i*) o, vxy_lo);
-    _mm_storeu_si128((__m128i*) (o + 4), vxy_hi);
-    o += 8;
-    n -= 16;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    if (n & 8) {
-      const __m128i vx = _mm_loadl_epi64((const __m128i*) x);
-      x += 2;
-      const __m128i vy = _mm_loadl_epi64((const __m128i*) y);
-      y += 2;
-      const __m128i vxy = _mm_unpacklo_epi32(vx, vy);
-      _mm_storeu_si128((__m128i*) o, vxy);
-      o += 4;
-    }
-    if (n & 4) {
-      const uint32_t vx = *x;
-      const uint32_t vy = *y;
-      o[0] = vx;
-      o[1] = vy;
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-x2-wasmsimd.c b/src/x32-zip/x32-zip-x2-wasmsimd.c
deleted file mode 100644
index f2478be361b..00000000000
--- a/src/x32-zip/x32-zip-x2-wasmsimd.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x2_ukernel__wasmsimd(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % sizeof(uint32_t) == 0);
-
-  const float* x = (const float*) input;
-  const float* y = (const float*) ((uintptr_t) x + n);
-  float* o = (float*) output;
-
-  while (n >= 4 * sizeof(uint32_t)) {
-    const v128_t vx = wasm_v128_load(x);
-    x += 4;
-    const v128_t vy = wasm_v128_load(y);
-    y += 4;
-    const v128_t vxy_lo = wasm_v32x4_shuffle(vx, vy, 0, 4, 1, 5);
-    const v128_t vxy_hi = wasm_v32x4_shuffle(vx, vy, 2, 6, 3, 7);
-    wasm_v128_store(o, vxy_lo);
-    wasm_v128_store(o + 4, vxy_hi);
-    o += 8;
-    n -= 4 * sizeof(uint32_t);
-  }
-  if XNN_UNLIKELY(n != 0) {
-    if (n & (2 * sizeof(uint32_t))) {
-      const double vx = *((const double*) x);
-      x += 2;
-      const double vy = *((const double*) y);
-      y += 2;
-      const v128_t vxy = wasm_f64x2_make(vx, vy);
-      wasm_v128_store(o, wasm_v32x4_shuffle(vxy, vxy, 0, 2, 1, 3));
-      o += 4;
-    }
-    if (n & (1 * sizeof(uint32_t))) {
-      const float vx = *x;
-      const float vy = *y;
-      o[0] = vx;
-      o[1] = vy;
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-x3-neon.c b/src/x32-zip/x32-zip-x3-neon.c
deleted file mode 100644
index 8ca3baa1291..00000000000
--- a/src/x32-zip/x32-zip-x3-neon.c
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x3_ukernel__neon(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-
-  const uint32_t* x = input;
-  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
-  const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n);
-  uint32_t* o = output;
-
-  while (n >= 16) {
-    uint32x4x3_t vxyz;
-    vxyz.val[0] = vld1q_u32(x); x += 4;
-    vxyz.val[1] = vld1q_u32(y); y += 4;
-    vxyz.val[2] = vld1q_u32(z); z += 4;
-    vst3q_u32(o, vxyz); o += 12;
-    n -= 16;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    if (n & 8) {
-      uint32x2x3_t vxyz;
-      vxyz.val[0] = vld1_u32(x); x += 2;
-      vxyz.val[1] = vld1_u32(y); y += 2;
-      vxyz.val[2] = vld1_u32(z); z += 2;
-      vst3_u32(o, vxyz); o += 6;
-    }
-    if (n & 4) {
-      uint32x2_t vxy = vld1_dup_u32(x);
-      const uint32x2_t vz = vld1_dup_u32(z);
-      vxy = vld1_lane_u32(y, vxy, 1);
-      vst1_u32(o, vxy); o += 2;
-      vst1_lane_u32(o, vz, 0);
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-x3-scalar.c b/src/x32-zip/x32-zip-x3-scalar.c
deleted file mode 100644
index 9a7cc7a93d9..00000000000
--- a/src/x32-zip/x32-zip-x3-scalar.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x3_ukernel__scalar(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-
-  const uint32_t* x = input;
-  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
-  const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n);
-  uint32_t* o = output;
-
-  do {
-    const uint32_t vx = *x++;
-    const uint32_t vy = *y++;
-    const uint32_t vz = *z++;
-    o[0] = vx;
-    o[1] = vy;
-    o[2] = vz;
-    o += 3;
-
-    n -= 4;
-  } while (n != 0);
-}
diff --git a/src/x32-zip/x32-zip-x3-sse2.c b/src/x32-zip/x32-zip-x3-sse2.c
deleted file mode 100644
index bef22257473..00000000000
--- a/src/x32-zip/x32-zip-x3-sse2.c
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x3_ukernel__sse2(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-
-  const float* x = (const float*) input;
-  const float* y = (const float*) ((uintptr_t) x + n);
-  const float* z = (const float*) ((uintptr_t) y + n);
-  float* o = (float*) output;
-
-  while (n >= 16) {
-    // vx = ( x3, x2, x1, x0 )
-    const __m128 vx = _mm_loadu_ps(x);
-    x += 4;
-    // vy = ( y3, y2, y1, y0 )
-    const __m128 vy = _mm_loadu_ps(y);
-    y += 4;
-    // vz = ( z3, z2, z1, z0 )
-    const __m128 vz = _mm_loadu_ps(z);
-    z += 4;
-
-    // vxy = ( y2, y0, x2, x0 )
-    const __m128 vxy = _mm_shuffle_ps(vx, vy, _MM_SHUFFLE(2, 0, 2, 0));
-    // vyz = ( z3, z1, y3, y1 )
-    const __m128 vyz = _mm_shuffle_ps(vy, vz, _MM_SHUFFLE(3, 1, 3, 1));
-    // vzx = ( x3, x1, z2, z0 )
-    const __m128 vzx = _mm_shuffle_ps(vz, vx, _MM_SHUFFLE(3, 1, 2, 0));
-
-    // vxyz0 = ( x1, z0, y0, x0 )
-    const __m128 vxyz0 = _mm_shuffle_ps(vxy, vzx, _MM_SHUFFLE(2, 0, 2, 0));
-    // vxyz1 = ( y2, x2, z1, y1 )
-    const __m128 vxyz1 = _mm_shuffle_ps(vyz, vxy, _MM_SHUFFLE(3, 1, 2, 0));
-    // vxyz2 = ( z3, y3, x3, z2 )
-    const __m128 vxyz2 = _mm_shuffle_ps(vzx, vyz, _MM_SHUFFLE(3, 1, 3, 1));
-
-    _mm_storeu_ps(o, vxyz0);
-    _mm_storeu_ps(o + 4, vxyz1);
-    _mm_storeu_ps(o + 8, vxyz2);
-    o += 12;
-    n -= 16;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    if (n & 8) {
-      // vx = ( -, -, x1, x0 )
-      const __m128 vx = _mm_castpd_ps(_mm_load_sd((const double*) x));
-      x += 2;
-      // vy = ( -, -, y1, y0 )
-      const __m128 vy = _mm_castpd_ps(_mm_load_sd((const double*) y));
-      y += 2;
-      // vz = ( -, -, z1, z0 )
-      const __m128 vz = _mm_castpd_ps(_mm_load_sd((const double*) z));
-      z += 2;
-
-      // vxy = ( y1, x1, y0, x0 )
-      const __m128 vxy = _mm_unpacklo_ps(vx, vy);
-      // vzx = ( x1, z1, x0, z0 )
-      const __m128 vzx = _mm_unpacklo_ps(vz, vx);
-      // vyz = ( z1, y1, z0, y0 )
-      const __m128 vyz = _mm_unpacklo_ps(vy, vz);
-
-      _mm_storeu_ps(o, _mm_shuffle_ps(vxy, vzx, _MM_SHUFFLE(3, 0, 1, 0)));
-      _mm_storeh_pi((__m64*) (o + 4), vyz);
-      o += 6;
-    }
-    if (n & 4) {
-      const __m128 vx = _mm_load_ss(x);
-      const __m128 vy = _mm_load_ss(y);
-      const __m128 vz = _mm_load_ss(z);
-      _mm_store_ss(o, vx);
-      _mm_store_ss(o + 1, vy);
-      _mm_store_ss(o + 2, vz);
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-x3-wasmsimd.c b/src/x32-zip/x32-zip-x3-wasmsimd.c
deleted file mode 100644
index 3aac4b632c7..00000000000
--- a/src/x32-zip/x32-zip-x3-wasmsimd.c
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x3_ukernel__wasmsimd(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % sizeof(uint32_t) == 0);
-
-  const float* x = (const float*) input;
-  const float* y = (const float*) ((uintptr_t) x + n);
-  const float* z = (const float*) ((uintptr_t) y + n);
-  float* o = (float*) output;
-
-  while (n >= 4 * sizeof(uint32_t)) {
-    // vx = ( x3, x2, x1, x0 )
-    const v128_t vx = wasm_v128_load(x);
-    x += 4;
-    // vy = ( y3, y2, y1, y0 )
-    const v128_t vy = wasm_v128_load(y);
-    y += 4;
-    // vz = ( z3, z2, z1, z0 )
-    const v128_t vz = wasm_v128_load(z);
-    z += 4;
-
-    // vxy = ( y2, y0, x2, x0 )
-    const v128_t vxy = wasm_v32x4_shuffle(vx, vy, 0, 2, 4, 6);
-    // vyz = ( z3, z1, y3, y1 )
-    const v128_t vyz = wasm_v32x4_shuffle(vy, vz, 1, 3, 5, 7);
-    // vzx = ( x3, x1, z2, z0 )
-    const v128_t vzx = wasm_v32x4_shuffle(vz, vx, 0, 2, 5, 7);
-
-    // vxyz0 = ( x1, z0, y0, x0 )
-    const v128_t vxyz0 = wasm_v32x4_shuffle(vxy, vzx, 0, 2, 4, 6);
-    // vxyz1 = ( y2, x2, z1, y1 )
-    const v128_t vxyz1 = wasm_v32x4_shuffle(vyz, vxy, 0, 2, 5, 7);
-    // vxyz2 = ( z3, y3, x3, z2 )
-    const v128_t vxyz2 = wasm_v32x4_shuffle(vzx, vyz, 1, 3, 5, 7);
-
-    wasm_v128_store(o, vxyz0);
-    wasm_v128_store(o + 4, vxyz1);
-    wasm_v128_store(o + 8, vxyz2);
-    o += 12;
-    n -= 4 * sizeof(uint32_t);
-  }
-  if XNN_UNLIKELY(n != 0) {
-    do {
-      const float vx = *x++;
-      const float vy = *y++;
-      const float vz = *z++;
-      o[0] = vx;
-      o[1] = vy;
-      o[2] = vz;
-      o += 3;
-      n -= sizeof(uint32_t);
-    } while (n != 0);
-  }
-}
diff --git a/src/x32-zip/x32-zip-x4-neon.c b/src/x32-zip/x32-zip-x4-neon.c
deleted file mode 100644
index ef9f54b5afa..00000000000
--- a/src/x32-zip/x32-zip-x4-neon.c
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x4_ukernel__neon(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-
-  const uint32_t* x = input;
-  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
-  const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n);
-  const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n);
-  uint32_t* o = output;
-
-  while (n >= 16) {
-    uint32x4x4_t vxyzw;
-    vxyzw.val[0] = vld1q_u32(x); x += 4;
-    vxyzw.val[1] = vld1q_u32(y); y += 4;
-    vxyzw.val[2] = vld1q_u32(z); z += 4;
-    vxyzw.val[3] = vld1q_u32(w); w += 4;
-    vst4q_u32(o, vxyzw); o += 16;
-    n -= 16;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    if (n & 8) {
-      uint32x2x4_t vxyzw;
-      vxyzw.val[0] = vld1_u32(x); x += 2;
-      vxyzw.val[1] = vld1_u32(y); y += 2;
-      vxyzw.val[2] = vld1_u32(z); z += 2;
-      vxyzw.val[3] = vld1_u32(w); w += 2;
-      vst4_u32(o, vxyzw); o += 8;
-    }
-    if (n & 4) {
-      uint32x4_t vxyzw = vld1q_dup_u32(x);
-      vxyzw = vld1q_lane_u32(y, vxyzw, 1);
-      vxyzw = vld1q_lane_u32(z, vxyzw, 2);
-      vxyzw = vld1q_lane_u32(w, vxyzw, 3);
-      vst1q_u32(o, vxyzw);
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-x4-scalar.c b/src/x32-zip/x32-zip-x4-scalar.c
deleted file mode 100644
index 73b36443c1b..00000000000
--- a/src/x32-zip/x32-zip-x4-scalar.c
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x4_ukernel__scalar(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-
-  const uint32_t* x = input;
-  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
-  const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n);
-  const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n);
-  uint32_t* o = output;
-
-  do {
-    const uint32_t vx = *x++;
-    const uint32_t vy = *y++;
-    const uint32_t vz = *z++;
-    const uint32_t vw = *w++;
-    o[0] = vx;
-    o[1] = vy;
-    o[2] = vz;
-    o[3] = vw;
-    o += 4;
-
-    n -= 4;
-  } while (n != 0);
-}
diff --git a/src/x32-zip/x32-zip-x4-sse2.c b/src/x32-zip/x32-zip-x4-sse2.c
deleted file mode 100644
index 82245e7b8d8..00000000000
--- a/src/x32-zip/x32-zip-x4-sse2.c
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x4_ukernel__sse2(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-
-  const uint32_t* x = input;
-  const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
-  const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n);
-  const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n);
-  uint32_t* o = output;
-
-  while (n >= 16) {
-    const __m128i vx = _mm_loadu_si128((const __m128i*) x);
-    x += 4;
-    const __m128i vy = _mm_loadu_si128((const __m128i*) y);
-    y += 4;
-    const __m128i vz = _mm_loadu_si128((const __m128i*) z);
-    z += 4;
-    const __m128i vw = _mm_loadu_si128((const __m128i*) w);
-    w += 4;
-
-    const __m128i vxy_lo = _mm_unpacklo_epi32(vx, vy);
-    const __m128i vxy_hi = _mm_unpackhi_epi32(vx, vy);
-    const __m128i vzw_lo = _mm_unpacklo_epi32(vz, vw);
-    const __m128i vzw_hi = _mm_unpackhi_epi32(vz, vw);
-
-    const __m128i vxyzw0 = _mm_unpacklo_epi64(vxy_lo, vzw_lo);
-    const __m128i vxyzw1 = _mm_unpackhi_epi64(vxy_lo, vzw_lo);
-    const __m128i vxyzw2 = _mm_unpacklo_epi64(vxy_hi, vzw_hi);
-    const __m128i vxyzw3 = _mm_unpackhi_epi64(vxy_hi, vzw_hi);
-
-    _mm_storeu_si128((__m128i*) o, vxyzw0);
-    _mm_storeu_si128((__m128i*) (o + 4), vxyzw1);
-    _mm_storeu_si128((__m128i*) (o + 8), vxyzw2);
-    _mm_storeu_si128((__m128i*) (o + 12), vxyzw3);
-    o += 16;
-    n -= 16;
-  }
-  if XNN_UNLIKELY(n != 0) {
-    if (n & 8) {
-      const __m128i vx = _mm_loadl_epi64((const __m128i*) x);
-      x += 2;
-      const __m128i vy = _mm_loadl_epi64((const __m128i*) y);
-      y += 2;
-      const __m128i vz = _mm_loadl_epi64((const __m128i*) z);
-      z += 2;
-      const __m128i vw = _mm_loadl_epi64((const __m128i*) w);
-      w += 2;
-
-      const __m128i vxy = _mm_unpacklo_epi32(vx, vy);
-      const __m128i vzw = _mm_unpacklo_epi32(vz, vw);
-
-      const __m128i vxyzw_lo = _mm_unpacklo_epi64(vxy, vzw);
-      const __m128i vxyzw_hi = _mm_unpackhi_epi64(vxy, vzw);
-
-      _mm_storeu_si128((__m128i*) o, vxyzw_lo);
-      _mm_storeu_si128((__m128i*) (o + 4), vxyzw_hi);
-      o += 8;
-    }
-    if (n & 4) {
-      const uint32_t vx = *x;
-      const uint32_t vy = *y;
-      const uint32_t vz = *z;
-      const uint32_t vw = *w;
-      o[0] = vx;
-      o[1] = vy;
-      o[2] = vz;
-      o[3] = vw;
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-x4-wasmsimd.c b/src/x32-zip/x32-zip-x4-wasmsimd.c
deleted file mode 100644
index 74dd3599461..00000000000
--- a/src/x32-zip/x32-zip-x4-wasmsimd.c
+++ /dev/null
@@ -1,86 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_x4_ukernel__wasmsimd(
-    size_t n,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % sizeof(uint32_t) == 0);
-
-  const float* x = (const float*) input;
-  const float* y = (const float*) ((uintptr_t) x + n);
-  const float* z = (const float*) ((uintptr_t) y + n);
-  const float* w = (const float*) ((uintptr_t) z + n);
-  float* o = (float*) output;
-
-  while (n >= 4 * sizeof(uint32_t)) {
-    const v128_t vx = wasm_v128_load(x);
-    x += 4;
-    const v128_t vy = wasm_v128_load(y);
-    y += 4;
-    const v128_t vz = wasm_v128_load(z);
-    z += 4;
-    const v128_t vw = wasm_v128_load(w);
-    w += 4;
-
-    const v128_t vxy_lo = wasm_v32x4_shuffle(vx, vy, 0, 4, 1, 5);
-    const v128_t vxy_hi = wasm_v32x4_shuffle(vx, vy, 2, 6, 3, 7);
-    const v128_t vzw_lo = wasm_v32x4_shuffle(vz, vw, 0, 4, 1, 5);
-    const v128_t vzw_hi = wasm_v32x4_shuffle(vz, vw, 2, 6, 3, 7);
-
-    const v128_t vxyzw0 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 0, 1, 4, 5);
-    const v128_t vxyzw1 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 2, 3, 6, 7);
-    const v128_t vxyzw2 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 0, 1, 4, 5);
-    const v128_t vxyzw3 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 2, 3, 6, 7);
-
-    wasm_v128_store(o, vxyzw0);
-    wasm_v128_store(o + 4, vxyzw1);
-    wasm_v128_store(o + 8, vxyzw2);
-    wasm_v128_store(o + 12, vxyzw3);
-    o += 16;
-    n -= 4 * sizeof(uint32_t);
-  }
-  if XNN_UNLIKELY(n != 0) {
-    if (n & (2 * sizeof(uint32_t))) {
-      const double vx = *((const double*) x);
-      x += 2;
-      const double vy = *((const double*) y);
-      y += 2;
-      const double vz = *((const double*) z);
-      z += 2;
-      const double vw = *((const double*) w);
-      w += 2;
-
-      const v128_t vxy = wasm_f64x2_make(vx, vy);
-      const v128_t vzw = wasm_f64x2_make(vz, vw);
-
-      const v128_t vxyzw_lo = wasm_v32x4_shuffle(vxy, vzw, 0, 2, 4, 6);
-      const v128_t vxyzw_hi = wasm_v32x4_shuffle(vxy, vzw, 1, 3, 5, 7);
-
-      wasm_v128_store(o, vxyzw_lo);
-      wasm_v128_store(o + 4, vxyzw_hi);
-      o += 8;
-    }
-    if (n & (1 * sizeof(uint32_t))) {
-      const float vx = *x;
-      const float vy = *y;
-      const float vz = *z;
-      const float vw = *w;
-      o[0] = vx;
-      o[1] = vy;
-      o[2] = vz;
-      o[3] = vw;
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-xm-neon.c b/src/x32-zip/x32-zip-xm-neon.c
deleted file mode 100644
index 13c56e6ced7..00000000000
--- a/src/x32-zip/x32-zip-xm-neon.c
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_xm_ukernel__neon(
-    size_t n,
-    size_t m,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-  assert(m >= 4);
-
-  const uint32_t* w = input;
-  const size_t group_increment = m * 4;
-  const size_t input_increment = n * 3;
-  const size_t output_increment = 16 - m * n;
-  const uint32_t* last_input = (const uint32_t*) ((uintptr_t) input + n * (m - 1));
-  uint32_t* last_output = (uint32_t*) ((uintptr_t) output + (m * 4 - 16));
-
-  for (size_t i = 0; i < m; i += 4) {
-    w = (const uint32_t*) ((uintptr_t) w + input_increment);
-    if (w >= last_input) {
-      w = last_input;
-    }
-    const uint32_t* z = (const uint32_t*) ((uintptr_t) w - n);
-    const uint32_t* y = (const uint32_t*) ((uintptr_t) z - n);
-    const uint32_t* x = (const uint32_t*) ((uintptr_t) y - n);
-
-    size_t k = n;
-    while (k >= 16) {
-      const uint32x4_t vx = vld1q_u32(x); x += 4;
-      const uint32x4_t vy = vld1q_u32(y); y += 4;
-      const uint32x4_t vz = vld1q_u32(z); z += 4;
-      const uint32x4_t vw = vld1q_u32(w); w += 4;
-
-      const uint32x4x2_t vxy = vzipq_u32(vx, vy);
-      const uint32x4x2_t vzw = vzipq_u32(vz, vw);
-
-      vst1_u32(output, vget_low_u32(vxy.val[0]));
-      vst1_u32(output + 2, vget_low_u32(vzw.val[0]));
-      output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-      vst1_u32(output, vget_high_u32(vxy.val[0]));
-      vst1_u32(output + 2, vget_high_u32(vzw.val[0]));
-      output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-      vst1_u32(output, vget_low_u32(vxy.val[1]));
-      vst1_u32(output + 2, vget_low_u32(vzw.val[1]));
-      output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-      vst1_u32(output, vget_high_u32(vxy.val[1]));
-      vst1_u32(output + 2, vget_high_u32(vzw.val[1]));
-      output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-      k -= 16;
-    }
-    if XNN_UNLIKELY(k != 0) {
-      if (k & 8) {
-        const uint32x2_t vx = vld1_u32(x); x += 2;
-        const uint32x2_t vy = vld1_u32(y); y += 2;
-        const uint32x2_t vz = vld1_u32(z); z += 2;
-        const uint32x2_t vw = vld1_u32(w); w += 2;
-
-        const uint32x2x2_t vxy = vzip_u32(vx, vy);
-        const uint32x2x2_t vzw = vzip_u32(vz, vw);
-
-        vst1_u32(output, vxy.val[0]);
-        vst1_u32(output + 2, vzw.val[0]);
-        output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-        vst1_u32(output, vxy.val[1]);
-        vst1_u32(output + 2, vzw.val[1]);
-        output = (uint32_t*) ((uintptr_t) output + group_increment);
-      }
-      if (k & 4) {
-        const uint32x2_t vx = vld1_dup_u32(x);
-        const uint32x2_t vz = vld1_dup_u32(z);
-        const uint32x2_t vxy = vld1_lane_u32(y, vx, 1);
-        const uint32x2_t vzw = vld1_lane_u32(w, vz, 1); w += 1;
-
-        vst1_u32(output, vxy);
-        vst1_u32(output + 2, vzw);
-        output = (uint32_t*) ((uintptr_t) output + group_increment);
-      }
-    }
-    output = (uint32_t*) ((uintptr_t) output + output_increment);
-    if (output > last_output) {
-      output = last_output;
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-xm-scalar.c b/src/x32-zip/x32-zip-xm-scalar.c
deleted file mode 100644
index 5d29999fa50..00000000000
--- a/src/x32-zip/x32-zip-xm-scalar.c
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_xm_ukernel__scalar(
-    size_t n,
-    size_t m,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-  assert(m >= 4);
-
-  size_t k = n;
-  do {
-    size_t l = m;
-    const uint32_t* input_column = input++;
-    do {
-      *output++ = *input_column;
-      input_column = (uint32_t*) ((uintptr_t) input_column + n);
-    } while (--l != 0);
-    k -= 4;
-  } while (k != 0);
-}
diff --git a/src/x32-zip/x32-zip-xm-sse2.c b/src/x32-zip/x32-zip-xm-sse2.c
deleted file mode 100644
index e5734c6732b..00000000000
--- a/src/x32-zip/x32-zip-xm-sse2.c
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_xm_ukernel__sse2(
-    size_t n,
-    size_t m,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % 4 == 0);
-  assert(m >= 4);
-
-  const uint32_t* w = input;
-  const size_t group_increment = m * 4;
-  const size_t input_increment = n * 3;
-  const size_t output_increment = 16 - m * n;
-  const uint32_t* last_input = (const uint32_t*) ((uintptr_t) input + n * (m - 1));
-  uint32_t* last_output = (uint32_t*) ((uintptr_t) output + (m * 4 - 16));
-
-  for (size_t i = 0; i < m; i += 4) {
-    w = (const uint32_t*) ((uintptr_t) w + input_increment);
-    if (w >= last_input) {
-      w = last_input;
-    }
-    const uint32_t* z = (const uint32_t*) ((uintptr_t) w - n);
-    const uint32_t* y = (const uint32_t*) ((uintptr_t) z - n);
-    const uint32_t* x = (const uint32_t*) ((uintptr_t) y - n);
-
-    size_t k = n;
-    while (k >= 16) {
-      const __m128i vx = _mm_loadu_si128((const __m128i*) x);
-      x += 4;
-      const __m128i vy = _mm_loadu_si128((const __m128i*) y);
-      y += 4;
-      const __m128i vz = _mm_loadu_si128((const __m128i*) z);
-      z += 4;
-      const __m128i vw = _mm_loadu_si128((const __m128i*) w);
-      w += 4;
-
-      const __m128i vxy_lo = _mm_unpacklo_epi32(vx, vy);
-      const __m128i vxy_hi = _mm_unpackhi_epi32(vx, vy);
-      const __m128i vzw_lo = _mm_unpacklo_epi32(vz, vw);
-      const __m128i vzw_hi = _mm_unpackhi_epi32(vz, vw);
-
-      const __m128i vxyzw0 = _mm_unpacklo_epi64(vxy_lo, vzw_lo);
-      const __m128i vxyzw1 = _mm_unpackhi_epi64(vxy_lo, vzw_lo);
-      const __m128i vxyzw2 = _mm_unpacklo_epi64(vxy_hi, vzw_hi);
-      const __m128i vxyzw3 = _mm_unpackhi_epi64(vxy_hi, vzw_hi);
-
-      _mm_storeu_si128((__m128i*) output, vxyzw0);
-      output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-      _mm_storeu_si128((__m128i*) output, vxyzw1);
-      output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-      _mm_storeu_si128((__m128i*) output, vxyzw2);
-      output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-      _mm_storeu_si128((__m128i*) output, vxyzw3);
-      output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-      k -= 16;
-    }
-    if XNN_UNLIKELY(k != 0) {
-      if (k & 8) {
-        const __m128i vx = _mm_loadl_epi64((const __m128i*) x);
-        x += 2;
-        const __m128i vy = _mm_loadl_epi64((const __m128i*) y);
-        y += 2;
-        const __m128i vz = _mm_loadl_epi64((const __m128i*) z);
-        z += 2;
-        const __m128i vw = _mm_loadl_epi64((const __m128i*) w);
-        w += 2;
-
-        const __m128i vxy = _mm_unpacklo_epi32(vx, vy);
-        const __m128i vzw = _mm_unpacklo_epi32(vz, vw);
-
-        const __m128i vxyzw_lo = _mm_unpacklo_epi64(vxy, vzw);
-        const __m128i vxyzw_hi = _mm_unpackhi_epi64(vxy, vzw);
-
-        _mm_storeu_si128((__m128i*) output, vxyzw_lo);
-        output = (uint32_t*) ((uintptr_t) output + group_increment);
-
-        _mm_storeu_si128((__m128i*) output, vxyzw_hi);
-        output = (uint32_t*) ((uintptr_t) output + group_increment);
-      }
-      if (k & 4) {
-        const uint32_t vx = *x;
-        const uint32_t vy = *y;
-        const uint32_t vz = *z;
-        const uint32_t vw = *w++;
-
-        output[0] = vx;
-        output[1] = vy;
-        output[2] = vz;
-        output[3] = vw;
-        output = (uint32_t*) ((uintptr_t) output + group_increment);
-      }
-    }
-    output = (uint32_t*) ((uintptr_t) output + output_increment);
-    if (output > last_output) {
-      output = last_output;
-    }
-  }
-}
diff --git a/src/x32-zip/x32-zip-xm-wasmsimd.c b/src/x32-zip/x32-zip-xm-wasmsimd.c
deleted file mode 100644
index 69d86ce317c..00000000000
--- a/src/x32-zip/x32-zip-xm-wasmsimd.c
+++ /dev/null
@@ -1,116 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x32_zip_xm_ukernel__wasmsimd(
-    size_t n,
-    size_t m,
-    const uint32_t* input,
-    uint32_t* output)
-{
-  assert(n != 0);
-  assert(n % sizeof(uint32_t) == 0);
-  assert(m >= 4);
-
-  const float* w = (const float*) input;
-  float* o = (float*) output;
-  const size_t group_increment = m * 4;
-  const size_t input_increment = n * 3;
-  const size_t output_increment = 4 * sizeof(uint32_t) - m * n;
-  const float* last_input = (const float*) ((uintptr_t) input + n * (m - 1));
-  float* last_output = (float*) ((uintptr_t) output + (m * 4 - 4 * sizeof(uint32_t)));
-
-  for (size_t i = 0; i < m; i += 4) {
-    w = (const float*) ((uintptr_t) w + input_increment);
-    if (w >= last_input) {
-      w = last_input;
-    }
-    const float* z = (const float*) ((uintptr_t) w - n);
-    const float* y = (const float*) ((uintptr_t) z - n);
-    const float* x = (const float*) ((uintptr_t) y - n);
-
-    size_t k = n;
-    while (k >= 4 * sizeof(uint32_t)) {
-      const v128_t vx = wasm_v128_load((const v128_t*) x);
-      x += 4;
-      const v128_t vy = wasm_v128_load((const v128_t*) y);
-      y += 4;
-      const v128_t vz = wasm_v128_load((const v128_t*) z);
-      z += 4;
-      const v128_t vw = wasm_v128_load((const v128_t*) w);
-      w += 4;
-
-      const v128_t vxy_lo = wasm_v32x4_shuffle(vx, vy, 0, 4, 1, 5);
-      const v128_t vxy_hi = wasm_v32x4_shuffle(vx, vy, 2, 6, 3, 7);
-      const v128_t vzw_lo = wasm_v32x4_shuffle(vz, vw, 0, 4, 1, 5);
-      const v128_t vzw_hi = wasm_v32x4_shuffle(vz, vw, 2, 6, 3, 7);
-
-      const v128_t vxyzw0 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 0, 1, 4, 5);
-      const v128_t vxyzw1 = wasm_v32x4_shuffle(vxy_lo, vzw_lo, 2, 3, 6, 7);
-      const v128_t vxyzw2 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 0, 1, 4, 5);
-      const v128_t vxyzw3 = wasm_v32x4_shuffle(vxy_hi, vzw_hi, 2, 3, 6, 7);
-
-      wasm_v128_store(o, vxyzw0);
-      o = (float*) ((uintptr_t) o + group_increment);
-
-      wasm_v128_store(o, vxyzw1);
-      o = (float*) ((uintptr_t) o + group_increment);
-
-      wasm_v128_store(o, vxyzw2);
-      o = (float*) ((uintptr_t) o + group_increment);
-
-      wasm_v128_store(o, vxyzw3);
-      o = (float*) ((uintptr_t) o + group_increment);
-
-      k -= 4 * sizeof(uint32_t);
-    }
-    if XNN_UNLIKELY(k != 0) {
-      if (k & (2 * sizeof(uint32_t))) {
-        const double vx = *((const double*) x);
-        x += 2;
-        const double vy = *((const double*) y);
-        y += 2;
-        const double vz = *((const double*) z);
-        z += 2;
-        const double vw = *((const double*) w);
-        w += 2;
-
-        const v128_t vxy = wasm_f64x2_make(vx, vy);
-        const v128_t vzw = wasm_f64x2_make(vz, vw);
-
-        const v128_t vxyzw_lo = wasm_v32x4_shuffle(vxy, vzw, 0, 2, 4, 6);
-        const v128_t vxyzw_hi = wasm_v32x4_shuffle(vxy, vzw, 1, 3, 5, 7);
-
-        wasm_v128_store(o, vxyzw_lo);
-        o = (float*) ((uintptr_t) o + group_increment);
-
-        wasm_v128_store(o, vxyzw_hi);
-        o = (float*) ((uintptr_t) o + group_increment);
-      }
-      if (k & (1 * sizeof(uint32_t))) {
-        const float vx = *x;
-        const float vy = *y;
-        const float vz = *z;
-        const float vw = *w++;
-
-        o[0] = vx;
-        o[1] = vy;
-        o[2] = vz;
-        o[3] = vw;
-        o = (float*) ((uintptr_t) o + group_increment);
-      }
-    }
-    o = (float*) ((uintptr_t) o + output_increment);
-    if (o > last_output) {
-      o = last_output;
-    }
-  }
-}
diff --git a/src/x8-zip/x8-zip-x2-neon.c b/src/x8-zip/x8-zip-x2-neon.c
deleted file mode 100644
index 19b9c97f663..00000000000
--- a/src/x8-zip/x8-zip-x2-neon.c
+++ /dev/null
@@ -1,47 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <arm_neon.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_x2_ukernel__neon(
-    size_t n,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  const uint8_t* x = input;
-  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
-  uint8_t* o = output;
-
-  if (n >= 8) {
-    do {
-      uint8x8x2_t vxy;
-      vxy.val[0] = vld1_u8(x); x += 8;
-      vxy.val[1] = vld1_u8(y); y += 8;
-      vst2_u8(o, vxy); o += 16;;
-      n -= 8;
-    } while (n >= 8);
-    if (n != 0) {
-      const size_t address_increment = n - 8;
-      uint8x8x2_t vxy;
-      vxy.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment));
-      vxy.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment));
-      vst2_u8((uint8_t*) ((uintptr_t) o + address_increment * 2), vxy);
-    }
-  } else {
-    do {
-      const uint8_t vx = *x++;
-      const uint8_t vy = *y++;
-      o[0] = vx;
-      o[1] = vy;
-      o += 2;
-    } while (--n != 0);
-  }
-}
diff --git a/src/x8-zip/x8-zip-x2-scalar.c b/src/x8-zip/x8-zip-x2-scalar.c
deleted file mode 100644
index a0ffcd24ce4..00000000000
--- a/src/x8-zip/x8-zip-x2-scalar.c
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_x2_ukernel__scalar(
-    size_t n,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  assert(n != 0);
-
-  const uint8_t* x = input;
-  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
-  uint8_t* o = output;
-
-  do {
-    const uint8_t vx = *x++;
-    const uint8_t vy = *y++;
-    o[0] = vx;
-    o[1] = vy;
-    o += 2;
-
-    n -= sizeof(uint8_t);
-  } while (n != 0);
-}
diff --git a/src/x8-zip/x8-zip-x2-sse2.c b/src/x8-zip/x8-zip-x2-sse2.c
deleted file mode 100644
index 640832fa874..00000000000
--- a/src/x8-zip/x8-zip-x2-sse2.c
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <emmintrin.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_x2_ukernel__sse2(
-    size_t n,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  const uint8_t* x = input;
-  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
-  uint8_t* o = output;
-
-  if (n >= 16) {
-    do {
-      const __m128i vx = _mm_loadu_si128((const __m128i*) x);
-      x += 16;
-      const __m128i vy = _mm_loadu_si128((const __m128i*) y);
-      y += 16;
-      const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
-      const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
-      _mm_storeu_si128((__m128i*) o, vxy_lo);
-      _mm_storeu_si128((__m128i*) (o + 16), vxy_hi);
-      o = (void*) ((uintptr_t) o + 32);
-      n -= 16;
-    } while (n >= 16);
-    if (n != 0) {
-      const size_t address_increment = n - 16;
-      const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment));
-      const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment));
-      const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
-      const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
-      o = (void*) ((uintptr_t) o + address_increment * 2);
-      _mm_storeu_si128((__m128i*) o, vxy_lo);
-      _mm_storeu_si128((__m128i*) o + 1, vxy_hi);
-    }
-  } else {
-    do {
-      const uint8_t vx = *x++;
-      const uint8_t vy = *y++;
-      o[0] = vx;
-      o[1] = vy;
-      o += 2;
-    } while (--n != 0);
-  }
-}
diff --git a/src/x8-zip/x8-zip-x3-neon.c b/src/x8-zip/x8-zip-x3-neon.c
deleted file mode 100644
index 6e947ebdee2..00000000000
--- a/src/x8-zip/x8-zip-x3-neon.c
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <arm_neon.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_x3_ukernel__neon(
-    size_t n,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  const uint8_t* x = input;
-  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
-  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
-  uint8_t* o = output;
-
-  if (n >= 8) {
-    do {
-      uint8x8x3_t vxyz;
-      vxyz.val[0] = vld1_u8(x); x += 8;
-      vxyz.val[1] = vld1_u8(y); y += 8;
-      vxyz.val[2] = vld1_u8(z); z += 8;
-      vst3_u8(o, vxyz); o += 24;
-      n -= 8;
-    } while (n >= 8);
-    if (n != 0) {
-      const size_t address_increment = n - 8;
-      uint8x8x3_t vxyz;
-      vxyz.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment));
-      vxyz.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment));
-      vxyz.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment));
-      vst3_u8((uint8_t*) ((uintptr_t) o + address_increment * 3), vxyz);
-    }
-  } else {
-    do {
-      const uint8_t vx = *x++;
-      const uint8_t vy = *y++;
-      const uint8_t vz = *z++;
-      o[0] = vx;
-      o[1] = vy;
-      o[2] = vz;
-      o += 3;
-    } while (--n != 0);
-  }
-}
diff --git a/src/x8-zip/x8-zip-x3-scalar.c b/src/x8-zip/x8-zip-x3-scalar.c
deleted file mode 100644
index a5768086de6..00000000000
--- a/src/x8-zip/x8-zip-x3-scalar.c
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_x3_ukernel__scalar(
-    size_t n,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  const uint8_t* x = input;
-  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
-  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
-  uint8_t* o = output;
-
-  do {
-    const uint8_t vx = *x++;
-    const uint8_t vy = *y++;
-    const uint8_t vz = *z++;
-    o[0] = vx;
-    o[1] = vy;
-    o[2] = vz;
-    o += 3;
-
-    n -= sizeof(uint8_t);
-  } while (n != 0);
-}
diff --git a/src/x8-zip/x8-zip-x3-sse2.c b/src/x8-zip/x8-zip-x3-sse2.c
deleted file mode 100644
index 4ac5cfd76ee..00000000000
--- a/src/x8-zip/x8-zip-x3-sse2.c
+++ /dev/null
@@ -1,137 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <emmintrin.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_x3_ukernel__sse2(
-    size_t n,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  const uint8_t* x = input;
-  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
-  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
-  uint8_t* o = output;
-
-  if (n >= 16) {
-    const __m128i vmask0x00FF00FF = _mm_set1_epi16(0x00FF);
-    const __m128i vmask0x0000FFFF = _mm_set1_epi32(0x0000FFFF);
-    do {
-      // vx  = ( x15, x14, x13, x12, x11, x10,  x9,  x8,  x7,  x6,  x5,  x4, x3, x2, x1, x0 )
-      const __m128i vx = _mm_loadu_si128((const __m128i*) x);
-      x += 16;
-      // vy  = ( y15, y14, y13, y12, y11, y10,  y9,  y8,  y7,  y6,  y5,  y4, y3, y2, y1, y0 )
-      const __m128i vy = _mm_loadu_si128((const __m128i*) y);
-      y += 16;
-      // vz  = ( z15, z14, z13, z12, z11, z10,  z9,  z8,  z7,  z6,  z5,  z4, z3, z2, z1, z0 )
-      const __m128i vz = _mm_loadu_si128((const __m128i*) z);
-      z += 16;
-
-      // vxeye     = ( y14, x14, y12, x12, y10, x10,  y8,  x8,  y6,  x6,  y4,  x4,  y2,  x2,  y0,  x0 )
-      const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
-      // vyozo     = ( z15, y15, z13, y13, z11, y11,  z9,  y9,  z7,  y7,  z5,  y5,  z3,  y3,  z1,  y1 )
-      const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
-      // vzoxo     = ( x15, z14, x13, z12, x11, z10,  x9,  z8,  x7,  z6,  x5,  z4,  x3,  z2,  x1,  z0 )
-      const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));
-
-      // vxeyezexo = ( x13, z12, y12, x12,  x9,  z8,  y8,  x8,  x5,  z4,  y4,  x4,  x1,  z0,  y0,  x0 )
-      const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
-      // vyozoxeye = ( y14, x14, z13, y13, y10, x10,  z9,  y9,  y6,  x6,  z5,  y5,  y2,  x2,  z1,  y1 )
-      const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
-      // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10,  z7,  y7,  x7,  z6,  z3,  y3,  x3,  z2 )
-      const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));
-
-      // vtemp0    = ( x13, z12, y12, x12,  x5,  z4,  y4,  x4, z11, y11, x11, z10,  z3,  y3,  x3,  z2 )
-      const __m128i vtemp0 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
-      // vtemp1    = ( y10, x10,  z9,  y9,  y2,  x2,  z1,  y1,  x9,  z8,  y8,  x8,  x1,  z0,  y0,  x0 )
-      const __m128i vtemp1 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
-      // vtemp2    = ( z15, y15, x15, z14,  z7,  y7,  x7,  z6, y14, x14, z13, y13,  y6,  x6,  z5,  y5 )
-      const __m128i vtemp2 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));
-
-      // vxyz0     = (  x5,  z4,  y4,  x4,  z3,  y3,  x3,  z2,  y2,  x2,  z1,  y1,  x1,  z0,  y0,  x0 )
-      const __m128i vxyz0 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
-      // vxyz1     = ( y10, x10,  z9,  y9,  x9,  z8,  y8,  x8,  z7,  y7,  x7,  z6,  y6,  x6,  z5,  y5 )
-      const __m128i vxyz1 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
-      // vxyz2     = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 )
-      const __m128i vxyz2 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));
-
-      _mm_storeu_si128((__m128i*) o, vxyz0);
-      _mm_storeu_si128((__m128i*) o + 1, vxyz1);
-      _mm_storeu_si128((__m128i*) o + 2, vxyz2);
-      o += 48;
-      n -= 16;
-    } while (n >= 16);
-    if (n != 0) {
-      const size_t address_increment = n - 16;
-      // vx  = ( x15, x14, x13, x12, x11, x10,  x9,  x8,  x7,  x6,  x5,  x4, x3, x2, x1, x0 )
-      const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment));
-      // vy  = ( y15, y14, y13, y12, y11, y10,  y9,  y8,  y7,  y6,  y5,  y4, y3, y2, y1, y0 )
-      const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment));
-      // vz  = ( z15, z14, z13, z12, z11, z10,  z9,  z8,  z7,  z6,  z5,  z4, z3, z2, z1, z0 )
-      const __m128i vz = _mm_loadu_si128((const __m128i*) ((uintptr_t) z + address_increment));
-
-      // vxeye     = ( y14, x14, y12, x12, y10, x10,  y8,  x8,  y6,  x6,  y4,  x4,  y2,  x2,  y0,  x0 )
-      const __m128i vxeye = _mm_or_si128(_mm_and_si128(vx, vmask0x00FF00FF), _mm_slli_epi16(vy, 8));
-      // vyozo     = ( z15, y15, z13, y13, z11, y11,  z9,  y9,  z7,  y7,  z5,  y5,  z3,  y3,  z1,  y1 )
-      const __m128i vyozo = _mm_or_si128(_mm_andnot_si128(vmask0x00FF00FF, vz), _mm_srli_epi16(vy, 8));
-      // vzoxo     = ( x15, z14, x13, z12, x11, z10,  x9,  z8,  x7,  z6,  x5,  z4,  x3,  z2,  x1,  z0 )
-      const __m128i vzexo = _mm_or_si128(_mm_and_si128(vz, vmask0x00FF00FF), _mm_andnot_si128(vmask0x00FF00FF, vx));
-
-      // vxeyezexo = ( x13, z12, y12, x12,  x9,  z8,  y8,  x8,  x5,  z4,  y4,  x4,  x1,  z0,  y0,  x0 )
-      const __m128i vxeyezexo = _mm_or_si128(_mm_and_si128(vxeye, vmask0x0000FFFF), _mm_slli_epi32(vzexo, 16));
-      // vyozoxeye = ( y14, x14, z13, y13, y10, x10,  z9,  y9,  y6,  x6,  z5,  y5,  y2,  x2,  z1,  y1 )
-      const __m128i vyozoxeye = _mm_or_si128(_mm_and_si128(vyozo, vmask0x0000FFFF), _mm_andnot_si128(vmask0x0000FFFF, vxeye));
-      // vzexoyozo = ( z15, y15, x15, z14, z11, y11, x11, z10,  z7,  y7,  x7,  z6,  z3,  y3,  x3,  z2 )
-      const __m128i vzexoyozo = _mm_or_si128(_mm_andnot_si128(vmask0x0000FFFF, vyozo), _mm_srli_epi32(vzexo, 16));
-
-      // vtemp0    = ( x13, z12, y12, x12,  x5,  z4,  y4,  x4, z11, y11, x11, z10,  z3,  y3,  x3,  z2 )
-      const __m128i vtemp0 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vzexoyozo), _mm_castsi128_ps(vxeyezexo), _MM_SHUFFLE(3, 1, 2, 0)));
-      // vtemp1    = ( y10, x10,  z9,  y9,  y2,  x2,  z1,  y1,  x9,  z8,  y8,  x8,  x1,  z0,  y0,  x0 )
-      const __m128i vtemp1 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vxeyezexo), _mm_castsi128_ps(vyozoxeye), _MM_SHUFFLE(2, 0, 2, 0)));
-      // vtemp2    = ( z15, y15, x15, z14,  z7,  y7,  x7,  z6, y14, x14, z13, y13,  y6,  x6,  z5,  y5 )
-      const __m128i vtemp2 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vyozoxeye), _mm_castsi128_ps(vzexoyozo), _MM_SHUFFLE(3, 1, 3, 1)));
-
-      // vxyz0     = (  x5,  z4,  y4,  x4,  z3,  y3,  x3,  z2,  y2,  x2,  z1,  y1,  x1,  z0,  y0,  x0 )
-      const __m128i vxyz0 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vtemp1), _mm_castsi128_ps(vtemp0), _MM_SHUFFLE(2, 0, 2, 0)));
-      // vxyz1     = ( y10, x10,  z9,  y9,  x9,  z8,  y8,  x8,  z7,  y7,  x7,  z6,  y6,  x6,  z5,  y5 )
-      const __m128i vxyz1 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vtemp2), _mm_castsi128_ps(vtemp1), _MM_SHUFFLE(3, 1, 2, 0)));
-      // vxyz2     = ( z15, y15, x15, z14, y14, x14, z13, y13, x13, z12, y12, x12, z11, y11, x11, z10 )
-      const __m128i vxyz2 = _mm_castps_si128(
-      _mm_shuffle_ps(_mm_castsi128_ps(vtemp0), _mm_castsi128_ps(vtemp2), _MM_SHUFFLE(3, 1, 3, 1)));
-
-      o = (uint8_t*) ((uintptr_t) o + address_increment * 3);
-      _mm_storeu_si128((__m128i*) o, vxyz0);
-      _mm_storeu_si128((__m128i*) o + 1, vxyz1);
-      _mm_storeu_si128((__m128i*) o + 2, vxyz2);
-    }
-  } else {
-    do {
-      const uint8_t vx = *x++;
-      const uint8_t vy = *y++;
-      const uint8_t vz = *z++;
-      o[0] = vx;
-      o[1] = vy;
-      o[2] = vz;
-      o += 3;
-    } while (--n != 0);
-  }
-}
diff --git a/src/x8-zip/x8-zip-x4-neon.c b/src/x8-zip/x8-zip-x4-neon.c
deleted file mode 100644
index 158b325f653..00000000000
--- a/src/x8-zip/x8-zip-x4-neon.c
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <arm_neon.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_x4_ukernel__neon(
-    size_t n,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  const uint8_t* x = input;
-  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
-  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
-  const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n);
-  uint8_t* o = output;
-
-  if (n >= 8) {
-    do {
-      uint8x8x4_t vxyzw;
-      vxyzw.val[0] = vld1_u8(x); x += 8;
-      vxyzw.val[1] = vld1_u8(y); y += 8;
-      vxyzw.val[2] = vld1_u8(z); z += 8;
-      vxyzw.val[3] = vld1_u8(w); w += 8;
-      vst4_u8(o, vxyzw); o += 32;
-      n -= 8;
-    } while (n >= 8);
-    if (n != 0) {
-      const size_t address_increment = n - 8;
-      uint8x8x4_t vxyzw;
-      vxyzw.val[0] = vld1_u8((const uint8_t*) ((uintptr_t) x + address_increment));
-      vxyzw.val[1] = vld1_u8((const uint8_t*) ((uintptr_t) y + address_increment));
-      vxyzw.val[2] = vld1_u8((const uint8_t*) ((uintptr_t) z + address_increment));
-      vxyzw.val[3] = vld1_u8((const uint8_t*) ((uintptr_t) w + address_increment));
-      vst4_u8((uint8_t*) ((uintptr_t) o + address_increment * 4), vxyzw);
-    }
-  } else {
-    do {
-      const uint8_t vx = *x++;
-      const uint8_t vy = *y++;
-      const uint8_t vz = *z++;
-      const uint8_t vw = *w++;
-      o[0] = vx;
-      o[1] = vy;
-      o[2] = vz;
-      o[3] = vw;
-      o += 4;
-    } while (--n != 0);
-  }
-}
diff --git a/src/x8-zip/x8-zip-x4-scalar.c b/src/x8-zip/x8-zip-x4-scalar.c
deleted file mode 100644
index bfce3071ab3..00000000000
--- a/src/x8-zip/x8-zip-x4-scalar.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_x4_ukernel__scalar(
-    size_t n,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  assert(n != 0);
-
-  const uint8_t* x = input;
-  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
-  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
-  const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n);
-  uint8_t* o = output;
-
-  do {
-    const uint8_t vx = *x++;
-    const uint8_t vy = *y++;
-    const uint8_t vz = *z++;
-    const uint8_t vw = *w++;
-    o[0] = vx;
-    o[1] = vy;
-    o[2] = vz;
-    o[3] = vw;
-    o += 4;
-
-    n -= sizeof(uint8_t);
-  } while (n != 0);
-}
diff --git a/src/x8-zip/x8-zip-x4-sse2.c b/src/x8-zip/x8-zip-x4-sse2.c
deleted file mode 100644
index b00dfc8901e..00000000000
--- a/src/x8-zip/x8-zip-x4-sse2.c
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <emmintrin.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_x4_ukernel__sse2(
-    size_t n,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  const uint8_t* x = input;
-  const uint8_t* y = (const uint8_t*) ((uintptr_t) x + n);
-  const uint8_t* z = (const uint8_t*) ((uintptr_t) y + n);
-  const uint8_t* w = (const uint8_t*) ((uintptr_t) z + n);
-  uint8_t* o = output;
-
-  if (n >= 16) {
-    do {
-      const __m128i vx = _mm_loadu_si128((const __m128i*) x);
-      x += 16;
-      const __m128i vy = _mm_loadu_si128((const __m128i*) y);
-      y += 16;
-      const __m128i vz = _mm_loadu_si128((const __m128i*) z);
-      z += 16;
-      const __m128i vw = _mm_loadu_si128((const __m128i*) w);
-      w += 16;
-      const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
-      const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
-      const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw);
-      const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw);
-      const __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo);
-      const __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo);
-      const __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi);
-      const __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi);
-      _mm_storeu_si128((__m128i*) o, vxyzw0);
-      _mm_storeu_si128((__m128i*) o + 1, vxyzw1);
-      _mm_storeu_si128((__m128i*) o + 2, vxyzw2);
-      _mm_storeu_si128((__m128i*) o + 3, vxyzw3);
-      o = (void*) ((uintptr_t) o + 64);
-      n -= 16;
-    } while (n >= 16);
-    if (n != 0) {
-      const size_t address_increment = n - 16;
-      const __m128i vx = _mm_loadu_si128((const __m128i*) ((uintptr_t) x + address_increment));
-      const __m128i vy = _mm_loadu_si128((const __m128i*) ((uintptr_t) y + address_increment));
-      const __m128i vz = _mm_loadu_si128((const __m128i*) ((uintptr_t) z + address_increment));
-      const __m128i vw = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + address_increment));
-      const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
-      const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
-      const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw);
-      const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw);
-      const __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo);
-      const __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo);
-      const __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi);
-      const __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi);
-      o = (void*) ((uintptr_t) o + address_increment * 4);
-      _mm_storeu_si128((__m128i*) o, vxyzw0);
-      _mm_storeu_si128((__m128i*) o + 1, vxyzw1);
-      _mm_storeu_si128((__m128i*) o + 2, vxyzw2);
-      _mm_storeu_si128((__m128i*) o + 3, vxyzw3);
-    }
-  } else {
-    do {
-      const uint8_t vx = *x++;
-      const uint8_t vy = *y++;
-      const uint8_t vz = *z++;
-      const uint8_t vw = *w++;
-      o[0] = vx;
-      o[1] = vy;
-      o[2] = vz;
-      o[3] = vw;
-      o += 4;
-    } while (--n != 0);
-  }
-}
diff --git a/src/x8-zip/x8-zip-xm-neon.c b/src/x8-zip/x8-zip-xm-neon.c
deleted file mode 100644
index 7839d0826d8..00000000000
--- a/src/x8-zip/x8-zip-xm-neon.c
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <arm_neon.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_xm_ukernel__neon(
-    size_t n,
-    size_t m,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  const uint8_t* w = input;
-  const size_t input_increment = n * 3;
-  const size_t output_increment = 4 - m * n;
-  const uint8_t* last_input = w + n * (m - 1);
-  uint8_t* last_output = (uint8_t*) ((uintptr_t) output + (m - 4));
-
-  if (n >= 8) {
-    for (size_t i = 0; i < m; i += 4) {
-      size_t k = n;
-      w = (const uint8_t*) ((uintptr_t) w + input_increment);
-      if (w >= last_input) {
-        w = last_input;
-      }
-      const uint8_t* z = (const uint8_t*) ((uintptr_t) w - n);
-      const uint8_t* y = (const uint8_t*) ((uintptr_t) z - n);
-      const uint8_t* x = (const uint8_t*) ((uintptr_t) y - n);
-      while (k >= 8) {
-        const uint8x8_t vx = vld1_u8(x); x += 8;
-        const uint8x8_t vy = vld1_u8(y); y += 8;
-        const uint8x8_t vz = vld1_u8(z); z += 8;
-        const uint8x8_t vw = vld1_u8(w); w += 8;
-
-        const uint8x8x2_t vxy = vzip_u8(vx, vy);
-        const uint8x8x2_t vzw = vzip_u8(vz, vw);
-        const uint16x4x2_t vxyzw_lo = vzip_u16(vreinterpret_u16_u8(vxy.val[0]), vreinterpret_u16_u8(vzw.val[0]));
-        const uint16x4x2_t vxyzw_hi = vzip_u16(vreinterpret_u16_u8(vxy.val[1]), vreinterpret_u16_u8(vzw.val[1]));
-
-        vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 0);
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[0]), 1);
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 0);
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_lo.val[1]), 1);
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 0);
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[0]), 1);
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 0);
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        vst1_lane_u32((void*) output, vreinterpret_u32_u16(vxyzw_hi.val[1]), 1);
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        k -= 8;
-      }
-      if (k != 0) {
-        const size_t address_increment = k - 8;
-        x = (const uint8_t*) ((uintptr_t) x + address_increment);
-        y = (const uint8_t*) ((uintptr_t) y + address_increment);
-        z = (const uint8_t*) ((uintptr_t) z + address_increment);
-        w = (const uint8_t*) ((uintptr_t) w + address_increment);
-        const int64x1_t vshift = vmov_n_s64(8 * address_increment);
-
-        const uint64x1_t vx = vshl_u64(vreinterpret_u64_u8(vld1_u8(x)), vshift);
-        const uint64x1_t vy = vshl_u64(vreinterpret_u64_u8(vld1_u8(y)), vshift);
-        const uint64x1_t vz = vshl_u64(vreinterpret_u64_u8(vld1_u8(z)), vshift);
-        const uint64x1_t vw = vshl_u64(vreinterpret_u64_u8(vld1_u8(w)), vshift); w += 8;
-        const uint8x8x2_t vxy = vzip_u8(vreinterpret_u8_u64(vx), vreinterpret_u8_u64(vy));
-        const uint8x8x2_t vzw = vzip_u8(vreinterpret_u8_u64(vz), vreinterpret_u8_u64(vw));
-        const uint16x4x2_t vxyzw_lo = vzip_u16(vreinterpret_u16_u8(vxy.val[0]), vreinterpret_u16_u8(vzw.val[0]));
-        const uint16x4x2_t vxyzw_hi = vzip_u16(vreinterpret_u16_u8(vxy.val[1]), vreinterpret_u16_u8(vzw.val[1]));
-
-        uint32x2_t vxyzw0 = vreinterpret_u32_u16(vxyzw_lo.val[0]);
-        uint32x2_t vxyzw1 = vreinterpret_u32_u16(vxyzw_lo.val[1]);
-        uint32x2_t vxyzw2 = vreinterpret_u32_u16(vxyzw_hi.val[0]);
-        uint32x2_t vxyzw3 = vreinterpret_u32_u16(vxyzw_hi.val[1]);
-
-        if (k & 4) {
-          vst1_lane_u32((void*) output, vxyzw0, 0);
-          output = (uint8_t*) ((uintptr_t) output + m);
-
-          vst1_lane_u32((void*) output, vxyzw0, 1);
-          output = (uint8_t*) ((uintptr_t) output + m);
-
-          vst1_lane_u32((void*) output, vxyzw1, 0);
-          output = (uint8_t*) ((uintptr_t) output + m);
-
-          vst1_lane_u32((void*) output, vxyzw1, 1);
-          output = (uint8_t*) ((uintptr_t) output + m);
-
-          vxyzw0 = vxyzw2;
-          vxyzw1 = vxyzw3;
-        }
-
-        if (k & 2) {
-          vst1_lane_u32((void*) output, vxyzw0, 0);
-          output = (uint8_t*) ((uintptr_t) output + m);
-
-          vst1_lane_u32((void*) output, vxyzw0, 1);
-          output = (uint8_t*) ((uintptr_t) output + m);
-
-          vxyzw0 = vxyzw1;
-        }
-        if (k & 1) {
-          vst1_lane_u32((void*) output, vxyzw0, 0);
-          output = (uint8_t*) ((uintptr_t) output + m);
-        }
-      }
-      output = (uint8_t*) ((uintptr_t) output + output_increment);
-      if (output > last_output) {
-        output = last_output;
-      }
-    }
-  } else {
-    const uint8_t* i = input;
-    uint8_t* o = output;
-    size_t k = n;
-    do {
-      size_t l = m;
-      const uint8_t* ii = i++;
-      do {
-        *o++ = *ii;
-        ii += n;
-      } while (--l != 0);
-    } while (--k != 0);
-  }
-}
diff --git a/src/x8-zip/x8-zip-xm-scalar.c b/src/x8-zip/x8-zip-xm-scalar.c
deleted file mode 100644
index 4d2a4e55333..00000000000
--- a/src/x8-zip/x8-zip-xm-scalar.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include "xnnpack/zip.h"
-
-
-void xnn_x8_zip_xm_ukernel__scalar(
-    size_t n,
-    size_t m,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  assert(n != 0);
-  assert(m >= 4);
-
-  size_t k = n;
-  do {
-    size_t l = m;
-    const uint8_t* input_column = input++;
-    do {
-      *output++ = *input_column;
-      input_column = (uint8_t*) ((uintptr_t) input_column + n);
-    } while (--l != 0);
-    k -= sizeof(uint8_t);
-  } while (k != 0);
-}
diff --git a/src/x8-zip/x8-zip-xm-sse2.c b/src/x8-zip/x8-zip-xm-sse2.c
deleted file mode 100644
index 1309639dd59..00000000000
--- a/src/x8-zip/x8-zip-xm-sse2.c
+++ /dev/null
@@ -1,207 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <emmintrin.h>
-
-#include "xnnpack/zip.h"
-#include "xnnpack/unaligned.h"
-
-
-void xnn_x8_zip_xm_ukernel__sse2(
-    size_t n,
-    size_t m,
-    const uint8_t* input,
-    uint8_t* output)
-{
-  const uint8_t* w = input;
-  const size_t input_increment = n * 3;
-  const size_t output_increment = 4 - m * n;
-  const uint8_t* last_input = w + n * (m - 1);
-  uint8_t* last_output = (uint8_t*) ((uintptr_t) output + (m - 4));
-
-  if (n >= 8) {
-    for (size_t i = 0; i < m; i += 4) {
-      size_t k = n;
-      w = (const uint8_t*) ((uintptr_t) w + input_increment);
-      if (w >= last_input) {
-        w = last_input;
-      }
-      const uint8_t* z = (const uint8_t*) ((uintptr_t) w - n);
-      const uint8_t* y = (const uint8_t*) ((uintptr_t) z - n);
-      const uint8_t* x = (const uint8_t*) ((uintptr_t) y - n);
-      while (k >= 16) {
-        const __m128i vx = _mm_loadu_si128((const __m128i*) x);
-        x += 16;
-        const __m128i vy = _mm_loadu_si128((const __m128i*) y);
-        y += 16;
-        const __m128i vz = _mm_loadu_si128((const __m128i*) z);
-        z += 16;
-        const __m128i vw = _mm_loadu_si128((const __m128i*) w);
-        w += 16;
-        const __m128i vxy_lo = _mm_unpacklo_epi8(vx, vy);
-        const __m128i vxy_hi = _mm_unpackhi_epi8(vx, vy);
-        const __m128i vzw_lo = _mm_unpacklo_epi8(vz, vw);
-        const __m128i vzw_hi = _mm_unpackhi_epi8(vz, vw);
-        __m128i vxyzw0 = _mm_unpacklo_epi16(vxy_lo, vzw_lo);
-        __m128i vxyzw1 = _mm_unpackhi_epi16(vxy_lo, vzw_lo);
-        __m128i vxyzw2 = _mm_unpacklo_epi16(vxy_hi, vzw_hi);
-        __m128i vxyzw3 = _mm_unpackhi_epi16(vxy_hi, vzw_hi);
-
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw1 = _mm_unpackhi_epi64(vxyzw1, vxyzw1);
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw2 = _mm_shufflelo_epi16(vxyzw2, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw2 = _mm_unpackhi_epi64(vxyzw2, vxyzw2);
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw2 = _mm_shufflelo_epi16(vxyzw2, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw2));
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw3 = _mm_shufflelo_epi16(vxyzw3, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw3 = _mm_unpackhi_epi64(vxyzw3, vxyzw3);
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw3 = _mm_shufflelo_epi16(vxyzw3, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw3));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        k -= 16;
-      };
-      if (k >= 8) {
-        const __m128i vx = _mm_loadl_epi64((const __m128i*) x);
-        x += 8;
-        const __m128i vy = _mm_loadl_epi64((const __m128i*) y);
-        y += 8;
-        const __m128i vz = _mm_loadl_epi64((const __m128i*) z);
-        z += 8;
-        const __m128i vw = _mm_loadl_epi64((const __m128i*) w);
-        w += 8;
-        const __m128i vxy = _mm_unpacklo_epi8(vx, vy);
-        const __m128i vzw = _mm_unpacklo_epi8(vz, vw);
-        __m128i vxyzw0 = _mm_unpacklo_epi16(vxy, vzw);
-        __m128i vxyzw1 = _mm_unpackhi_epi16(vxy, vzw);
-
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-        output = (uint8_t*) ((uintptr_t) output + m);
-
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw1 = _mm_unpackhi_epi64(vxyzw1, vxyzw1);
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        vxyzw1 = _mm_shufflelo_epi16(vxyzw1, _MM_SHUFFLE(3, 2, 3, 2));
-        unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw1));
-        output = (uint8_t*) ((uintptr_t) output + m);
-        k -= 8;
-      }
-      if (k != 0) {
-        const size_t address_decrement = 8 - k;
-        x -= address_decrement;
-        y -= address_decrement;
-        z -= address_decrement;
-        w -= address_decrement;
-        const __m128i vshift = _mm_cvtsi32_si128((int) address_decrement * 8);
-
-        const __m128i vx = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) x), vshift);
-        const __m128i vy = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) y), vshift);
-        const __m128i vz = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) z), vshift);
-        const __m128i vw = _mm_srl_epi64(_mm_loadl_epi64((const __m128i*) w), vshift);
-        w += 8;
-        const __m128i vxy = _mm_unpacklo_epi8(vx, vy);
-        const __m128i vzw = _mm_unpacklo_epi8(vz, vw);
-        __m128i vxyzw0 = _mm_unpacklo_epi16(vxy, vzw);
-        __m128i vxyzw1 = _mm_unpackhi_epi16(vxy, vzw);
-
-        if (k & 4) {
-          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-          output = (uint8_t*) ((uintptr_t) output + m);
-          vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
-          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-          output = (uint8_t*) ((uintptr_t) output + m);
-          vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
-          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-          output = (uint8_t*) ((uintptr_t) output + m);
-          vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
-          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-          output = (uint8_t*) ((uintptr_t) output + m);
-          vxyzw0 = vxyzw1;
-        }
-
-        if (k & 2) {
-          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-          output = (uint8_t*) ((uintptr_t) output + m);
-          vxyzw0 = _mm_shufflelo_epi16(vxyzw0, _MM_SHUFFLE(3, 2, 3, 2));
-          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-          output = (uint8_t*) ((uintptr_t) output + m);
-          vxyzw0 = _mm_unpackhi_epi64(vxyzw0, vxyzw0);
-        }
-        if (k & 1) {
-          unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vxyzw0));
-          output = (uint8_t*) ((uintptr_t) output + m);
-        }
-      }
-      output = (uint8_t*) ((uintptr_t) output + output_increment);
-      if (output > last_output) {
-        output = last_output;
-      }
-    }
-  } else {
-    const uint8_t* i = input;
-    uint8_t* o = output;
-    size_t k = n;
-    do {
-      size_t l = m;
-      const uint8_t* ii = i++;
-      do {
-        *o++ = *ii;
-        ii += n;
-      } while (--l != 0);
-    } while (--k != 0);
-  }
-}
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
index e5f819db0b0..7ee797f9c5e 100644
--- a/src/xnnpack/compute.h
+++ b/src/xnnpack/compute.h
@@ -1198,29 +1198,6 @@ struct elementwise_binary_context {
       size_t i, size_t j, size_t k, size_t l, size_t m);
 #endif
 
-struct channel_shuffle_context {
-  const void* x;
-  size_t x_stride;
-  void* y;
-  size_t y_stride;
-  size_t n;
-  size_t m;
-  union {
-    xnn_zipc_ukernel_fn fixed_ukernel;
-    xnn_zipv_ukernel_fn variable_ukernel;
-  };
-};
-
-#ifndef __cplusplus
-  XNN_PRIVATE void xnn_compute_channel_shuffle_fixed(
-      const struct channel_shuffle_context context[restrict XNN_MIN_ELEMENTS(1)],
-      size_t index);
-
-  XNN_PRIVATE void xnn_compute_channel_shuffle_variable(
-      const struct channel_shuffle_context context[restrict XNN_MIN_ELEMENTS(1)],
-      size_t index);
-#endif
-
 struct lut_strided_context {
   size_t n;
   const void* x;
diff --git a/src/xnnpack/config.h b/src/xnnpack/config.h
index 3ff8c61441a..0fd3d1bf933 100644
--- a/src/xnnpack/config.h
+++ b/src/xnnpack/config.h
@@ -267,9 +267,6 @@ XNN_INTERNAL const struct xnn_maxpool_config* xnn_init_f32_maxpool_config();
 XNN_INTERNAL const struct xnn_maxpool_config* xnn_init_s8_maxpool_config();
 XNN_INTERNAL const struct xnn_maxpool_config* xnn_init_u8_maxpool_config();
 
-XNN_INTERNAL const struct xnn_zip_config* xnn_init_x8_zip_config();
-XNN_INTERNAL const struct xnn_zip_config* xnn_init_x32_zip_config();
-
 XNN_INTERNAL const struct xnn_rmax_config* xnn_init_f16_rmax_config();
 XNN_INTERNAL const struct xnn_rmax_config* xnn_init_f32_rmax_config();
 XNN_INTERNAL const struct xnn_rmax_config* xnn_init_u8_rmax_config();
diff --git a/src/xnnpack/operator-type-defs.h b/src/xnnpack/operator-type-defs.h
index 623ef6da653..96aecb7bfc1 100644
--- a/src/xnnpack/operator-type-defs.h
+++ b/src/xnnpack/operator-type-defs.h
@@ -18,8 +18,6 @@ XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_f32, "Batch Matrix Mult
 XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_qd8_f32_qc8w, "Batch Matrix Multiply (NC, QD8, F32, QC8W)")
 XNN_ENUM_ITEM(xnn_operator_type_batch_matrix_multiply_nc_qdu8_f32_qc8w, "Batch Matrix Multiply (NC, QDU8, F32, QC8W)")
 XNN_ENUM_ITEM(xnn_operator_type_binary_elementwise, "Binary Elementwise (ND)")
-XNN_ENUM_ITEM(xnn_operator_type_channel_shuffle_nc_x8, "Channel Shuffle (NC, X8)")
-XNN_ENUM_ITEM(xnn_operator_type_channel_shuffle_nc_x32, "Channel Shuffle (NC, X32)")
 XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x8, "Constant Pad (ND, X8)")
 XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x16, "Constant Pad (ND, X16)")
 XNN_ENUM_ITEM(xnn_operator_type_constant_pad_nd_x32, "Constant Pad (ND, X32)")
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index 6d88baa4919..c43a5107a9e 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -331,7 +331,6 @@ struct xnn_operator {
   union {
     struct argmax_pooling_context argmax_pooling;
     struct average_pooling_context average_pooling;
-    struct channel_shuffle_context channel_shuffle;
     struct conv2d_context conv2d;
     struct dwconv2d_context dwconv2d;
     struct {
diff --git a/test/BUILD.bazel b/test/BUILD.bazel
index d67eaa6dc01..c1f21d2fa8b 100644
--- a/test/BUILD.bazel
+++ b/test/BUILD.bazel
@@ -1084,15 +1084,6 @@ xnnpack_unit_test(
     deps = MICROKERNEL_TEST_DEPS,
 )
 
-xnnpack_unit_test(
-    name = "x8_zip_test",
-    srcs = [
-        "x8-zip.cc",
-        "zip-microkernel-tester.h",
-    ],
-    deps = MICROKERNEL_TEST_DEPS,
-)
-
 xnnpack_unit_test(
     name = "x32_packb_test",
     srcs = [
@@ -1151,15 +1142,6 @@ xnnpack_unit_test(
     deps = MICROKERNEL_TEST_DEPS,
 )
 
-xnnpack_unit_test(
-    name = "x32_zip_test",
-    srcs = [
-        "x32-zip.cc",
-        "zip-microkernel-tester.h",
-    ],
-    deps = MICROKERNEL_TEST_DEPS,
-)
-
 xnnpack_unit_test(
     name = "xx_fill_test",
     srcs = ["xx-fill.cc"],
@@ -1237,15 +1219,6 @@ xnnpack_unit_test(
     deps = OPERATOR_TEST_DEPS,
 )
 
-xnnpack_unit_test(
-    name = "channel_shuffle_nc_test",
-    srcs = [
-        "channel-shuffle-nc.cc",
-        "channel-shuffle-operator-tester.h",
-    ],
-    deps = OPERATOR_TEST_DEPS,
-)
-
 xnnpack_unit_test(
     name = "constant_pad_nd_test",
     srcs = [
diff --git a/test/channel-shuffle-nc.cc b/test/channel-shuffle-nc.cc
deleted file mode 100644
index b84e9f993d4..00000000000
--- a/test/channel-shuffle-nc.cc
+++ /dev/null
@@ -1,504 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <cstddef>
-
-#include <gtest/gtest.h>
-#include "channel-shuffle-operator-tester.h"
-
-TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_unit_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(1)
-      .groups(2)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_unit_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(1)
-      .groups(3)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_unit_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(1)
-      .groups(4)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_unit_batch) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(1)
-        .groups(groups)
-        .group_channels(group_channels)
-        .iterations(3)
-        .TestX8();
-    }
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_small_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(2)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_small_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(3)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_small_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(4)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_small_batch) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(3)
-        .groups(groups)
-        .group_channels(group_channels)
-        .iterations(3)
-        .TestX8();
-    }
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_small_batch_with_input_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(2)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_small_batch_with_input_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(3)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_small_batch_with_input_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(4)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_small_batch_with_input_stride) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(3)
-        .groups(groups)
-        .group_channels(group_channels)
-        .input_stride(1007)
-        .iterations(3)
-        .TestX8();
-    }
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_small_batch_with_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(2)
-      .group_channels(group_channels)
-      .output_stride(513)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_small_batch_with_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(3)
-      .group_channels(group_channels)
-      .output_stride(513)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_small_batch_with_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(4)
-      .group_channels(group_channels)
-      .output_stride(513)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_small_batch_with_output_stride) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(3)
-        .groups(groups)
-        .group_channels(group_channels)
-        .output_stride(1111)
-        .iterations(3)
-        .TestX8();
-    }
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, two_groups_small_batch_with_input_and_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(2)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .output_stride(513)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, three_groups_small_batch_with_input_and_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(3)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .output_stride(513)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, four_groups_small_batch_with_input_and_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(4)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .output_stride(513)
-      .iterations(3)
-      .TestX8();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X8, many_groups_small_batch_with_input_and_output_stride) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(3)
-        .groups(groups)
-        .group_channels(group_channels)
-        .input_stride(1007)
-        .output_stride(1111)
-        .iterations(3)
-        .TestX8();
-    }
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_unit_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(1)
-      .groups(2)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_unit_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(1)
-      .groups(3)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_unit_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(1)
-      .groups(4)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_unit_batch) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(1)
-        .groups(groups)
-        .group_channels(group_channels)
-        .iterations(3)
-        .TestX32();
-    }
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_small_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(2)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_small_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(3)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_small_batch) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(4)
-      .group_channels(group_channels)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_small_batch) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(3)
-        .groups(groups)
-        .group_channels(group_channels)
-        .iterations(3)
-        .TestX32();
-    }
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_small_batch_with_input_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(2)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_small_batch_with_input_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(3)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_small_batch_with_input_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(4)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_small_batch_with_input_stride) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(3)
-        .groups(groups)
-        .group_channels(group_channels)
-        .input_stride(1007)
-        .iterations(3)
-        .TestX32();
-    }
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_small_batch_with_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(2)
-      .group_channels(group_channels)
-      .output_stride(513)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_small_batch_with_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(3)
-      .group_channels(group_channels)
-      .output_stride(513)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_small_batch_with_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(4)
-      .group_channels(group_channels)
-      .output_stride(513)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_small_batch_with_output_stride) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(3)
-        .groups(groups)
-        .group_channels(group_channels)
-        .output_stride(1111)
-        .iterations(3)
-        .TestX32();
-    }
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, two_groups_small_batch_with_input_and_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(2)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .output_stride(513)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, three_groups_small_batch_with_input_and_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(3)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .output_stride(513)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, four_groups_small_batch_with_input_and_output_stride) {
-  for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-    ChannelShuffleOperatorTester()
-      .batch_size(3)
-      .groups(4)
-      .group_channels(group_channels)
-      .input_stride(511)
-      .output_stride(513)
-      .iterations(3)
-      .TestX32();
-  }
-}
-
-TEST(CHANNEL_SHUFFLE_NC_X32, many_groups_small_batch_with_input_and_output_stride) {
-  for (size_t groups = 5; groups < 12; groups += 3) {
-    for (size_t group_channels = 1; group_channels < 100; group_channels += 15) {
-      ChannelShuffleOperatorTester()
-        .batch_size(3)
-        .groups(groups)
-        .group_channels(group_channels)
-        .input_stride(1007)
-        .output_stride(1111)
-        .iterations(3)
-        .TestX32();
-    }
-  }
-}
diff --git a/test/channel-shuffle-operator-tester.h b/test/channel-shuffle-operator-tester.h
deleted file mode 100644
index e3ac7117df8..00000000000
--- a/test/channel-shuffle-operator-tester.h
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#pragma once
-
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <cstdlib>
-#include <limits>
-#include <memory>
-#include <random>
-#include <vector>
-
-#include <gtest/gtest.h>
-#include "xnnpack.h"
-#include "xnnpack/buffer.h"
-#include "replicable_random_device.h"
-
-class ChannelShuffleOperatorTester {
- public:
-  ChannelShuffleOperatorTester& groups(size_t groups) {
-    assert(groups != 0);
-    this->groups_ = groups;
-    return *this;
-  }
-
-  size_t groups() const {
-    return this->groups_;
-  }
-
-  ChannelShuffleOperatorTester& group_channels(size_t group_channels) {
-    assert(group_channels != 0);
-    this->group_channels_ = group_channels;
-    return *this;
-  }
-
-  size_t group_channels() const {
-    return this->group_channels_;
-  }
-
-  size_t channels() const {
-    return groups() * group_channels();
-  }
-
-  ChannelShuffleOperatorTester& input_stride(size_t input_stride) {
-    assert(input_stride != 0);
-    this->input_stride_ = input_stride;
-    return *this;
-  }
-
-  size_t input_stride() const {
-    if (this->input_stride_ == 0) {
-      return channels();
-    } else {
-      assert(this->input_stride_ >= channels());
-      return this->input_stride_;
-    }
-  }
-
-  ChannelShuffleOperatorTester& output_stride(size_t output_stride) {
-    assert(output_stride != 0);
-    this->output_stride_ = output_stride;
-    return *this;
-  }
-
-  size_t output_stride() const {
-    if (this->output_stride_ == 0) {
-      return channels();
-    } else {
-      assert(this->output_stride_ >= channels());
-      return this->output_stride_;
-    }
-  }
-
-  ChannelShuffleOperatorTester& batch_size(size_t batch_size) {
-    assert(batch_size != 0);
-    this->batch_size_ = batch_size;
-    return *this;
-  }
-
-  size_t batch_size() const {
-    return this->batch_size_;
-  }
-
-  ChannelShuffleOperatorTester& iterations(size_t iterations) {
-    this->iterations_ = iterations;
-    return *this;
-  }
-
-  size_t iterations() const {
-    return this->iterations_;
-  }
-
-  void TestX8() const {
-    xnnpack::ReplicableRandomDevice rng;
-    std::uniform_int_distribution<int32_t> u8dist(
-      std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max());
-
-    xnnpack::Buffer<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<uint8_t> output((batch_size() - 1) * output_stride() + channels());
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(input.begin(), input.end(), [&]() { return u8dist(rng); });
-
-      // Create, setup, run, and destroy Channel Shuffle operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t channel_shuffle_op = nullptr;
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_create_channel_shuffle_nc_x8(
-          groups(), group_channels(),
-          input_stride(), output_stride(),
-          0, &channel_shuffle_op));
-      ASSERT_NE(nullptr, channel_shuffle_op);
-
-      // Smart pointer to automatically delete channel_shuffle_op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_channel_shuffle_op(channel_shuffle_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_reshape_channel_shuffle_nc_x8(
-          channel_shuffle_op,
-          batch_size(),
-          /*threadpool=*/nullptr));
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_setup_channel_shuffle_nc_x8(
-          channel_shuffle_op,
-          input.data(), output.data()));
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_run_operator(channel_shuffle_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t g = 0; g < groups(); g++) {
-          for (size_t c = 0; c < group_channels(); c++) {
-            ASSERT_EQ(int32_t(input[i * input_stride() + g * group_channels() + c]),
-                int32_t(output[i * output_stride() + c * groups() + g]))
-              << "batch index " << i << ", group " << g << ", channel " << c;
-          }
-        }
-      }
-    }
-  }
-
-  void TestX32() const {
-    xnnpack::ReplicableRandomDevice rng;
-    std::uniform_int_distribution<uint32_t> u32dist;
-
-    xnnpack::Buffer<uint32_t> input(XNN_EXTRA_BYTES / sizeof(uint32_t) + (batch_size() - 1) * input_stride() + channels());
-    xnnpack::Buffer<uint32_t> output((batch_size() - 1) * output_stride() + channels());
-    for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      std::generate(input.begin(), input.end(), [&]() { return u32dist(rng); });
-
-      // Create, setup, run, and destroy Channel Shuffle operator.
-      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
-      xnn_operator_t channel_shuffle_op = nullptr;
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_create_channel_shuffle_nc_x32(
-          groups(), group_channels(),
-          input_stride(), output_stride(),
-          0, &channel_shuffle_op));
-      ASSERT_NE(nullptr, channel_shuffle_op);
-
-      // Smart pointer to automatically delete channel_shuffle_op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_channel_shuffle_op(channel_shuffle_op, xnn_delete_operator);
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_reshape_channel_shuffle_nc_x32(
-          channel_shuffle_op,
-          batch_size(),
-          /*threadpool=*/nullptr));
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_setup_channel_shuffle_nc_x32(
-          channel_shuffle_op,
-          input.data(), output.data()));
-
-      ASSERT_EQ(xnn_status_success,
-        xnn_run_operator(channel_shuffle_op, /*threadpool=*/nullptr));
-
-      // Verify results.
-      for (size_t i = 0; i < batch_size(); i++) {
-        for (size_t g = 0; g < groups(); g++) {
-          for (size_t c = 0; c < group_channels(); c++) {
-            ASSERT_EQ(input[i * input_stride() + g * group_channels() + c],
-                output[i * output_stride() + c * groups() + g])
-              << "batch index " << i << ", group " << g << ", channel " << c;
-          }
-        }
-      }
-    }
-  }
-
- private:
-  size_t groups_{1};
-  size_t group_channels_{1};
-  size_t batch_size_{1};
-  size_t input_stride_{0};
-  size_t output_stride_{0};
-  size_t iterations_{15};
-};