diff --git a/.travis.yml b/.travis.yml
index 84aebf317..274f6ed5c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -169,10 +169,10 @@ matrix:
     # BENCHMARKS:
     - name: "Benchmarks - x86_64-unknown-linux-gnu"
       install: TARGET=x86_64-unknown-linux-gnu ./ci/setup_benchmarks.sh
-      script: PATH=$(pwd):$PATH NORUN=1 VERIFY=1 FEATURES=coresimd,ispc,sleef-sys ci/benchmark.sh
+      script: PATH=$(pwd):$PATH NORUN=1 VERIFY=1 FEATURES=core_arch,ispc,sleef-sys ci/benchmark.sh
     - name: "Benchmarks - x86_64-apple-darwin"
       install: TARGET=x86_64-apple-darwin ./ci/setup_benchmarks.sh
-      script: PATH=$(pwd):$PATH NORUN=1 VERIFY=1 FEATURES=coresimd,ispc,sleef-sys ci/benchmark.sh
+      script: PATH=$(pwd):$PATH NORUN=1 VERIFY=1 FEATURES=core_arch,ispc,sleef-sys ci/benchmark.sh
       os: osx
       osx_image: xcode9.4
     # TOOLS:
diff --git a/Cargo.toml b/Cargo.toml
index aae8eb17a..df076edd7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,7 +21,7 @@ maintenance = { status = "experimental" }
 
 [dependencies]
 cfg-if = "^0.1"
-coresimd = { version = "^0.1.0", optional = true }
+core_arch = { version = "^0.1.3", optional = true }
 
 [features]
 default = []
@@ -32,7 +32,7 @@ paste = "^0.1.3"
 arrayvec = { version = "^0.4", default-features = false }
 
 [target.'cfg(target_arch = "x86_64")'.dependencies.sleef-sys]
-version = "^0.1"
+version = "^0.1.2"
 optional = true
 
 [target.wasm32-unknown-unknown.dev-dependencies]
diff --git a/ci/android-install-ndk.sh b/ci/android-install-ndk.sh
index 873f6c52c..818e78446 100644
--- a/ci/android-install-ndk.sh
+++ b/ci/android-install-ndk.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/usr/bin/env sh
 # Copyright 2016 The Rust Project Developers. See the COPYRIGHT
 # file at the top-level directory of this distribution and at
 # http://rust-lang.org/COPYRIGHT.
@@ -11,27 +11,27 @@
 
 set -ex
 
-curl -O https://dl.google.com/android/repository/android-ndk-r15b-linux-x86_64.zip
+curl --retry 5 -O https://dl.google.com/android/repository/android-ndk-r15b-linux-x86_64.zip
 unzip -q android-ndk-r15b-linux-x86_64.zip
 
 case "$1" in
-  aarch64)
-    arch=arm64
-    ;;
+    aarch64)
+        arch=arm64
+        ;;
 
-  i686)
-    arch=x86
-    ;;
+    i686)
+        arch=x86
+        ;;
 
-  *)
-    arch=$1
-    ;;
+    *)
+        arch=$1
+        ;;
 esac;
 
 android-ndk-r15b/build/tools/make_standalone_toolchain.py \
-        --unified-headers \
-        --install-dir /android/ndk-$1 \
-        --arch $arch \
-        --api 24
+    --unified-headers \
+    --install-dir "/android/ndk-${1}" \
+    --arch "${arch}" \
+    --api 24
 
 rm -rf ./android-ndk-r15b-linux-x86_64.zip ./android-ndk-r15b
diff --git a/ci/android-install-sdk.sh b/ci/android-install-sdk.sh
index ab7e14d95..6b5ac09ab 100644
--- a/ci/android-install-sdk.sh
+++ b/ci/android-install-sdk.sh
@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/usr/bin/env sh
 # Copyright 2016 The Rust Project Developers. See the COPYRIGHT
 # file at the top-level directory of this distribution and at
 # http://rust-lang.org/COPYRIGHT.
@@ -19,7 +19,7 @@ set -ex
 # which apparently magically accepts the licenses.
 
 mkdir sdk
-curl https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip -O
+curl --retry 5 https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip -O
 unzip -d sdk sdk-tools-linux-3859397.zip
 
 case "$1" in
@@ -46,9 +46,9 @@ case "$1" in
 esac;
 
 # --no_https avoids
-# javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: No trusted certificate found
-echo "yes" | \
-    ./sdk/tools/bin/sdkmanager --no_https \
+# javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: No trusted certificate found
+yes | ./sdk/tools/bin/sdkmanager --licenses --no_https
+yes | ./sdk/tools/bin/sdkmanager --no_https \
         "emulator" \
         "platform-tools" \
         "platforms;android-24" \
@@ -56,5 +56,5 @@ echo "yes" | \
 
 echo "no" |
     ./sdk/tools/bin/avdmanager create avd \
-        --name $1 \
+        --name "${1}" \
         --package "system-images;android-24;default;$abi"
diff --git a/ci/android-sysimage.sh b/ci/android-sysimage.sh
index 9611dfeb0..9eabd7c8d 100644
--- a/ci/android-sysimage.sh
+++ b/ci/android-sysimage.sh
@@ -1,3 +1,5 @@
+#!/usr/bin/env bash
+
 # Copyright 2017 The Rust Project Developers. See the COPYRIGHT
 # file at the top-level directory of this distribution and at
 # http://rust-lang.org/COPYRIGHT.
@@ -13,32 +15,34 @@ set -ex
 URL=https://dl.google.com/android/repository/sys-img/android
 
 main() {
-    local arch=$1
-    local name=$2
+    local arch="${1}"
+    local name="${2}"
     local dest=/system
-    local td=$(mktemp -d)
+    local td
+    td="$(mktemp -d)"
 
     apt-get install --no-install-recommends e2tools
 
-    pushd $td
-    curl -O $URL/$name
-    unzip -q $name
+    pushd "${td}"
+    curl --retry 5 -O "${URL}/${name}"
+    unzip -q "${name}"
 
-    local system=$(find . -name system.img)
-    mkdir -p $dest/{bin,lib,lib64}
+    local system
+    system="$(find . -name system.img)"
+    mkdir -p ${dest}/{bin,lib,lib64}
 
     # Extract android linker and libraries to /system
     # This allows android executables to be run directly (or with qemu)
-    if [ $arch = "x86_64" -o $arch = "arm64" ]; then
-        e2cp -p $system:/bin/linker64 $dest/bin/
-        e2cp -p $system:/lib64/libdl.so $dest/lib64/
-        e2cp -p $system:/lib64/libc.so $dest/lib64/
-        e2cp -p $system:/lib64/libm.so $dest/lib64/
+    if [ "${arch}" = "x86_64" ] || [ "${arch}" = "arm64" ]; then
+        e2cp -p "${system}:/bin/linker64" "${dest}/bin/"
+        e2cp -p "${system}:/lib64/libdl.so" "${dest}/lib64/"
+        e2cp -p "${system}:/lib64/libc.so" "${dest}/lib64/"
+        e2cp -p "${system}:/lib64/libm.so" "${dest}/lib64/"
     else
-        e2cp -p $system:/bin/linker $dest/bin/
-        e2cp -p $system:/lib/libdl.so $dest/lib/
-        e2cp -p $system:/lib/libc.so $dest/lib/
-        e2cp -p $system:/lib/libm.so $dest/lib/
+        e2cp -p "${system}:/bin/linker" "${dest}/bin/"
+        e2cp -p "${system}:/lib/libdl.so" "${dest}/lib/"
+        e2cp -p "${system}:/lib/libc.so" "${dest}/lib/"
+        e2cp -p "${system}:/lib/libm.so" "${dest}/lib/"
     fi
 
     # clean up
@@ -46,7 +50,7 @@ main() {
 
     popd
 
-    rm -rf $td
+    rm -rf "${td}"
 }
 
 main "${@}"
diff --git a/ci/benchmark.sh b/ci/benchmark.sh
index d4b821a4d..3635b9e37 100755
--- a/ci/benchmark.sh
+++ b/ci/benchmark.sh
@@ -2,7 +2,7 @@
 #
 # Runs all benchmarks. Controlled by the following environment variables:
 #
-# FEATURES={} - cargo features to pass to all benchmarks (e.g. coresimd,sleef-sys,ispc)
+# FEATURES={} - cargo features to pass to all benchmarks (e.g. core_arch,sleef-sys,ispc)
 # NORUN={1}   - only builds the benchmarks
 
 set -ex
diff --git a/ci/run.sh b/ci/run.sh
index fefad3409..7bb825883 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -6,7 +6,7 @@ set -ex
 
 # Tests are all super fast anyway, and they fault often enough on travis that
 # having only one thread increases debuggability to be worth it.
-export RUST_TEST_THREADS=1
+#export RUST_TEST_THREADS=1  # disabled: this script no longer forces one test thread
 #export RUST_BACKTRACE=full
 #export RUST_TEST_NOCAPTURE=1
 
@@ -47,6 +47,13 @@ echo "RUST_TEST_NOCAPTURE=${RUST_TEST_NOCAPTURE}"
 
 cargo_test() {
     cmd="cargo ${CARGO_SUBCMD} --verbose --target=${TARGET} ${@}"
+    if [ "${NORUN}" != "1" ]
+    then
+        if [ "$TARGET" != "wasm32-unknown-unknown" ]
+        then
+            cmd="$cmd -- --quiet"
+        fi
+    fi
     mkdir target || true
     ${cmd} 2>&1 | tee > target/output
     if [[ ${PIPESTATUS[0]} != 0 ]]; then
@@ -71,9 +78,9 @@ fi
 
 if [[ "${TARGET}" == "x86_64-unknown-linux-gnu" ]] || [[ "${TARGET}" == "x86_64-pc-windows-msvc" ]]; then
     # use sleef on linux and windows x86_64 builds
-    cargo_test_impl --release --features=into_bits,coresimd,sleef-sys
+    cargo_test_impl --release --features=into_bits,core_arch,sleef-sys
 else
-    cargo_test_impl --release --features=into_bits,coresimd
+    cargo_test_impl --release --features=into_bits,core_arch
 fi
 
 # Verify code generation
diff --git a/ci/runtest-android.rs b/ci/runtest-android.rs
index d8968f99f..ed1cd80c8 100644
--- a/ci/runtest-android.rs
+++ b/ci/runtest-android.rs
@@ -3,8 +3,12 @@ use std::process::Command;
 use std::path::{Path, PathBuf};
 
 fn main() {
-    assert_eq!(env::args_os().len(), 2);
-    let test = PathBuf::from(env::args_os().nth(1).unwrap());
+    let args = env::args_os()
+        .skip(1)
+        .filter(|arg| arg != "--quiet")
+        .collect::<Vec<_>>();
+    assert_eq!(args.len(), 1);
+    let test = PathBuf::from(&args[0]);
     let dst = Path::new("/data/local/tmp").join(test.file_name().unwrap());
 
     let status = Command::new("adb")
diff --git a/examples/aobench/Cargo.toml b/examples/aobench/Cargo.toml
index e3b34d831..b990a8afb 100644
--- a/examples/aobench/Cargo.toml
+++ b/examples/aobench/Cargo.toml
@@ -33,7 +33,7 @@ criterion = { version = '^0.2', features=['real_blackbox'] }
 default = [ "256bit" ]
 256bit = []
 sleef-sys = [ "packed_simd/sleef-sys" ]
-coresimd = [ "packed_simd/coresimd" ]
+core_arch = [ "packed_simd/core_arch" ]
 
 [[bench]]
 name = "isec_sphere"
diff --git a/examples/mandelbrot/Cargo.toml b/examples/mandelbrot/Cargo.toml
index e9f85a69b..ab9b17f5c 100644
--- a/examples/mandelbrot/Cargo.toml
+++ b/examples/mandelbrot/Cargo.toml
@@ -25,4 +25,4 @@ path = "src/lib.rs"
 [features]
 default = []
 sleef-sys = ["packed_simd/sleef-sys"]
-coresimd = ["packed_simd/coresimd"]
+core_arch = ["packed_simd/core_arch"]
diff --git a/examples/nbody/Cargo.toml b/examples/nbody/Cargo.toml
index 5913e0d11..81c284d06 100644
--- a/examples/nbody/Cargo.toml
+++ b/examples/nbody/Cargo.toml
@@ -14,3 +14,8 @@ path = "src/main.rs"
 [lib]
 name = "nbody_lib"
 path = "src/lib.rs"
+
+[features]
+default = [ ]
+sleef-sys = [ "packed_simd/sleef-sys" ]
+core_arch = [ "packed_simd/core_arch" ]
diff --git a/examples/options_pricing/Cargo.toml b/examples/options_pricing/Cargo.toml
index 7d9e98ad6..1bc0a7d11 100644
--- a/examples/options_pricing/Cargo.toml
+++ b/examples/options_pricing/Cargo.toml
@@ -23,6 +23,6 @@ path = "src/lib.rs"
 
 [features]
 default = []
-coresimd = [ "packed_simd/coresimd" ]
+core_arch = [ "packed_simd/core_arch" ]
 sleef-sys = [ "packed_simd/sleef-sys" ]
 ispc_libm = [ "ispc" ]
diff --git a/examples/stencil/Cargo.toml b/examples/stencil/Cargo.toml
index f44a41e9c..c1bf8c89d 100644
--- a/examples/stencil/Cargo.toml
+++ b/examples/stencil/Cargo.toml
@@ -23,5 +23,5 @@ path = "src/lib.rs"
 
 [features]
 default = []
-coresimd = ["packed_simd/coresimd"]
+core_arch = ["packed_simd/core_arch"]
 sleef-sys = ["packed_simd/sleef-sys"]
diff --git a/src/api/bit_manip.rs b/src/api/bit_manip.rs
index b87e1808c..3d3c4eb88 100644
--- a/src/api/bit_manip.rs
+++ b/src/api/bit_manip.rs
@@ -17,15 +17,15 @@ macro_rules! impl_bit_manip {
                 super::codegen::bit_manip::BitManip::ctpop(!self)
             }
 
-            /// Returns the number of leading zeros in the binary representation
-            /// of the lanes of `self`.
+            /// Returns the number of leading zeros in the binary
+            /// representation of the lanes of `self`.
             #[inline]
             pub fn leading_zeros(self) -> Self {
                 super::codegen::bit_manip::BitManip::ctlz(self)
             }
 
-            /// Returns the number of trailing zeros in the binary representation
-            /// of the lanes of `self`.
+            /// Returns the number of trailing zeros in the binary
+            /// representation of the lanes of `self`.
             #[inline]
             pub fn trailing_zeros(self) -> Self {
                 super::codegen::bit_manip::BitManip::cttz(self)
@@ -45,7 +45,10 @@ macro_rules! impl_bit_manip {
                         ($x:expr, $func:ident) => {{
                             let mut actual = $x;
                             for i in 0..$id::lanes() {
-                                actual = actual.replace(i, $x.extract(i).$func() as $elem_ty);
+                                actual = actual.replace(
+                                    i,
+                                    $x.extract(i).$func() as $elem_ty
+                                );
                             }
                             let expected = $x.$func();
                             assert_eq!(actual, expected);
@@ -73,21 +76,24 @@ macro_rules! impl_bit_manip {
                         $id::from_slice_unaligned(elems)
                     }
 
-                    #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+                    #[cfg_attr(not(target_arch = "wasm32"), test)]
+                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                     fn count_ones() {
                         test_func!($id::splat(0), count_ones);
                         test_func!($id::splat(!0), count_ones);
                         test_func!(load_bytes(), count_ones);
                     }
 
-                    #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+                    #[cfg_attr(not(target_arch = "wasm32"), test)]
+                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                     fn count_zeros() {
                         test_func!($id::splat(0), count_zeros);
                         test_func!($id::splat(!0), count_zeros);
                         test_func!(load_bytes(), count_zeros);
                     }
 
-                    #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+                    #[cfg_attr(not(target_arch = "wasm32"), test)]
+                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                     fn leading_zeros() {
                         test_func!($id::splat(0), leading_zeros);
                         test_func!($id::splat(1), leading_zeros);
@@ -95,17 +101,24 @@ macro_rules! impl_bit_manip {
                         // behavior when the 8th bit is set.
                         test_func!($id::splat(0b1000_0010), leading_zeros);
                         test_func!($id::splat(!0), leading_zeros);
-                        test_func!($id::splat(1 << (LANE_WIDTH - 1)), leading_zeros);
+                        test_func!(
+                            $id::splat(1 << (LANE_WIDTH - 1)),
+                            leading_zeros
+                        );
                         test_func!(load_bytes(), leading_zeros);
                     }
 
-                    #[cfg_attr(not(target_arch = "wasm32"), test)] #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+                    #[cfg_attr(not(target_arch = "wasm32"), test)]
+                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                     fn trailing_zeros() {
                         test_func!($id::splat(0), trailing_zeros);
                         test_func!($id::splat(1), trailing_zeros);
                         test_func!($id::splat(0b1000_0010), trailing_zeros);
                         test_func!($id::splat(!0), trailing_zeros);
-                        test_func!($id::splat(1 << (LANE_WIDTH - 1)), trailing_zeros);
+                        test_func!(
+                            $id::splat(1 << (LANE_WIDTH - 1)),
+                            trailing_zeros
+                        );
                         test_func!(load_bytes(), trailing_zeros);
                     }
                 }
diff --git a/src/api/from/from_array.rs b/src/api/from/from_array.rs
index eb5f53c45..964d1501d 100644
--- a/src/api/from/from_array.rs
+++ b/src/api/from/from_array.rs
@@ -53,7 +53,7 @@ macro_rules! impl_from_array {
         }
         */
 
-        test_if!{
+        test_if! {
             $test_tt:
             paste::item! {
                 mod [<$id _from>] {
diff --git a/src/api/from/from_vector.rs b/src/api/from/from_vector.rs
index 2cda3f330..55f70016d 100644
--- a/src/api/from/from_vector.rs
+++ b/src/api/from/from_vector.rs
@@ -31,7 +31,7 @@ macro_rules! impl_from_vector {
            }
         */
 
-        test_if!{
+        test_if! {
             $test_tt:
             paste::item! {
                 pub mod [<$id _from_ $source>] {
diff --git a/src/api/into_bits/arch_specific.rs b/src/api/into_bits/arch_specific.rs
index e4c412162..a93cdcd30 100644
--- a/src/api/into_bits/arch_specific.rs
+++ b/src/api/into_bits/arch_specific.rs
@@ -38,13 +38,13 @@ macro_rules! impl_arch {
         #[cfg(any(
             not(target_arch = "arm"),
             all(target_feature = "v7", target_feature = "neon",
-                feature = "coresimd"))
+                feature = "core_arch"))
         )]
         // note: if target is "powerpc", "altivec" must be enabled
         // and the std library must be recompiled with it
         #[cfg(any(
             not(target_arch = "powerpc"),
-            all(target_feature = "altivec", feature = "coresimd"),
+            all(target_feature = "altivec", feature = "core_arch"),
         ))]
         #[cfg(target_arch = $arch_tt)]
         use crate::arch::$arch::{
@@ -54,11 +54,11 @@ macro_rules! impl_arch {
         #[cfg(any(
             not(target_arch = "arm"),
             all(target_feature = "v7", target_feature = "neon",
-                feature = "coresimd"))
+                feature = "core_arch"))
         )]
         #[cfg(any(
             not(target_arch = "powerpc"),
-            all(target_feature = "altivec", feature = "coresimd"),
+            all(target_feature = "altivec", feature = "core_arch"),
         ))]
         #[cfg(target_arch = $arch_tt)]
         impl_arch!($($arch_ty),* | $($from_ty),* | $($into_ty),* |
diff --git a/src/api/into_bits/macros.rs b/src/api/into_bits/macros.rs
index 48d6b6e2f..8cec5b004 100644
--- a/src/api/into_bits/macros.rs
+++ b/src/api/into_bits/macros.rs
@@ -9,7 +9,7 @@ macro_rules! impl_from_bits_ {
             }
         }
 
-        test_if!{
+        test_if! {
             $test_tt:
             paste::item! {
                 pub mod [<$id _from_bits_ $from_ty>] {
diff --git a/src/api/reductions/float_arithmetic.rs b/src/api/reductions/float_arithmetic.rs
index 95fb3606c..dd722ae25 100644
--- a/src/api/reductions/float_arithmetic.rs
+++ b/src/api/reductions/float_arithmetic.rs
@@ -90,7 +90,7 @@ macro_rules! impl_reduction_float_arithmetic {
             }
         }
 
-        test_if!{
+        test_if! {
             $test_tt:
             paste::item! {
                 pub mod [<$id _reduction_float_arith>] {
diff --git a/src/api/reductions/integer_arithmetic.rs b/src/api/reductions/integer_arithmetic.rs
index a26608318..91dffad31 100644
--- a/src/api/reductions/integer_arithmetic.rs
+++ b/src/api/reductions/integer_arithmetic.rs
@@ -95,7 +95,7 @@ macro_rules! impl_reduction_integer_arithmetic {
             }
         }
 
-        test_if!{
+        test_if! {
             $test_tt:
             paste::item! {
                 pub mod [<$id _reduction_int_arith>] {
diff --git a/src/api/reductions/min_max.rs b/src/api/reductions/min_max.rs
index bee9e1e2b..c4d3aa10f 100644
--- a/src/api/reductions/min_max.rs
+++ b/src/api/reductions/min_max.rs
@@ -74,7 +74,7 @@ macro_rules! impl_reduction_min_max {
                 }
             }
         }
-        test_if!{$test_tt:
+        test_if! {$test_tt:
         paste::item! {
             pub mod [<$id _reduction_min_max>] {
                 use super::*;
diff --git a/src/api/slice/from_slice.rs b/src/api/slice/from_slice.rs
index 0208cf318..ca83c7df7 100644
--- a/src/api/slice/from_slice.rs
+++ b/src/api/slice/from_slice.rs
@@ -84,7 +84,7 @@ macro_rules! impl_slice_from_slice {
             }
         }
 
-        test_if!{
+        test_if! {
             $test_tt:
             paste::item! {
                 pub mod [<$id _slice_from_slice>] {
diff --git a/src/api/slice/write_to_slice.rs b/src/api/slice/write_to_slice.rs
index bf79849a4..becb564d4 100644
--- a/src/api/slice/write_to_slice.rs
+++ b/src/api/slice/write_to_slice.rs
@@ -55,8 +55,6 @@ macro_rules! impl_slice_write_to_slice {
                     0
                 );
 
-                        #[cfg_attr(feature = "cargo-clippy",
-                                   allow(clippy::cast_ptr_alignment))]
                 #[cfg_attr(
                     feature = "cargo-clippy",
                     allow(clippy::cast_ptr_alignment)
@@ -85,7 +83,7 @@ macro_rules! impl_slice_write_to_slice {
             }
         }
 
-        test_if!{
+        test_if! {
             $test_tt:
             paste::item! {
                 pub mod [<$id _slice_write_to_slice>] {
diff --git a/src/codegen.rs b/src/codegen.rs
index 72be7f1b2..b7ccd8386 100644
--- a/src/codegen.rs
+++ b/src/codegen.rs
@@ -1,12 +1,12 @@
 //! Code-generation utilities
 
+crate mod bit_manip;
 crate mod llvm;
 crate mod math;
 crate mod reductions;
 crate mod shuffle;
 crate mod shuffle1_dyn;
 crate mod swap_bytes;
-crate mod bit_manip;
 
 macro_rules! impl_simd_array {
     ([$elem_ty:ident; $elem_count:expr]:
diff --git a/src/codegen/bit_manip.rs b/src/codegen/bit_manip.rs
index 9700a5a1e..947266f5b 100644
--- a/src/codegen/bit_manip.rs
+++ b/src/codegen/bit_manip.rs
@@ -1,4 +1,5 @@
 //! LLVM bit manipulation intrinsics.
+#![rustfmt::skip]
 
 use crate::*;
 
@@ -51,7 +52,6 @@ extern "C" {
     #[link_name = "llvm.ctlz.v4i128"]
     fn ctlz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4;
 
-
     #[link_name = "llvm.cttz.v2i8"]
     fn cttz_u8x2(x: u8x2, is_zero_undef: bool) -> u8x2;
     #[link_name = "llvm.cttz.v4i8"]
@@ -99,7 +99,6 @@ extern "C" {
     #[link_name = "llvm.cttz.v4i128"]
     fn cttz_u128x4(x: u128x4, is_zero_undef: bool) -> u128x4;
 
-
     #[link_name = "llvm.ctpop.v2i8"]
     fn ctpop_u8x2(x: u8x2) -> u8x2;
     #[link_name = "llvm.ctpop.v4i8"]
@@ -155,7 +154,8 @@ crate trait BitManip {
 }
 
 macro_rules! impl_bit_manip {
-    (inner: $ty:ident, $scalar:ty, $uty:ident, $ctpop:ident, $ctlz:ident, $cttz:ident) => {
+    (inner: $ty:ident, $scalar:ty, $uty:ident,
+     $ctpop:ident, $ctlz:ident, $cttz:ident) => {
         // FIXME: several LLVM intrinsics break on s390x https://github.com/rust-lang-nursery/packed_simd/issues/192
         #[cfg(target_arch = "s390x")]
         impl_bit_manip! { scalar: $ty, $scalar }
@@ -170,7 +170,8 @@ macro_rules! impl_bit_manip {
             #[inline]
             fn ctlz(self) -> Self {
                 let y: $uty = self.cast();
-                // the ctxx intrinsics need compile-time constant `is_zero_undef`
+                // the ctlz/cttz intrinsics require `is_zero_undef` to be
+                // a compile-time constant
                 unsafe { $ctlz(y, false).cast() }
             }
 
@@ -211,7 +212,8 @@ macro_rules! impl_bit_manip {
             fn ctpop(self) -> Self {
                 let mut ones = self;
                 for i in 0..Self::lanes() {
-                    ones = ones.replace(i, self.extract(i).count_ones() as $scalar);
+                    ones = ones
+                        .replace(i, self.extract(i).count_ones() as $scalar);
                 }
                 ones
             }
@@ -220,7 +222,10 @@ macro_rules! impl_bit_manip {
             fn ctlz(self) -> Self {
                 let mut lz = self;
                 for i in 0..Self::lanes() {
-                    lz = lz.replace(i, self.extract(i).leading_zeros() as $scalar);
+                    lz = lz.replace(
+                        i,
+                        self.extract(i).leading_zeros() as $scalar,
+                    );
                 }
                 lz
             }
@@ -229,44 +234,49 @@ macro_rules! impl_bit_manip {
             fn cttz(self) -> Self {
                 let mut tz = self;
                 for i in 0..Self::lanes() {
-                    tz = tz.replace(i, self.extract(i).trailing_zeros() as $scalar);
+                    tz = tz.replace(
+                        i,
+                        self.extract(i).trailing_zeros() as $scalar,
+                    );
                 }
                 tz
             }
         }
     };
-    ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty, $ctpop:ident, $ctlz:ident, $cttz:ident) => {
+    ($uty:ident, $uscalar:ty, $ity:ident, $iscalar:ty,
+     $ctpop:ident, $ctlz:ident, $cttz:ident) => {
         impl_bit_manip! { inner: $uty, $uscalar, $uty, $ctpop, $ctlz, $cttz }
         impl_bit_manip! { inner: $ity, $iscalar, $uty, $ctpop, $ctlz, $cttz }
     };
-    (sized: $usize:ident, $uscalar:ty, $isize:ident, $iscalar:ty, $ty:ident) => {
+    (sized: $usize:ident, $uscalar:ty, $isize:ident,
+     $iscalar:ty, $ty:ident) => {
         impl_bit_manip! { sized_inner: $usize, $uscalar, $ty }
         impl_bit_manip! { sized_inner: $isize, $iscalar, $ty }
     };
 }
 
-impl_bit_manip! { u8x2,   u8, i8x2, i8,   ctpop_u8x2,   ctlz_u8x2,   cttz_u8x2   }
-impl_bit_manip! { u8x4,   u8, i8x4, i8,   ctpop_u8x4,   ctlz_u8x4,   cttz_u8x4   }
+impl_bit_manip! { u8x2   ,   u8, i8x2, i8,   ctpop_u8x2,   ctlz_u8x2,   cttz_u8x2   }
+impl_bit_manip! { u8x4   ,   u8, i8x4, i8,   ctpop_u8x4,   ctlz_u8x4,   cttz_u8x4   }
 #[cfg(not(target_arch = "aarch64"))] // see below
-impl_bit_manip! { u8x8,   u8, i8x8, i8,   ctpop_u8x8,   ctlz_u8x8,   cttz_u8x8   }
-impl_bit_manip! { u8x16,  u8, i8x16, i8,  ctpop_u8x16,  ctlz_u8x16,  cttz_u8x16  }
-impl_bit_manip! { u8x32,  u8, i8x32, i8,  ctpop_u8x32,  ctlz_u8x32,  cttz_u8x32  }
-impl_bit_manip! { u8x64,  u8, i8x64, i8,  ctpop_u8x64,  ctlz_u8x64,  cttz_u8x64  }
-impl_bit_manip! { u16x2,  u16, i16x2, i16,  ctpop_u16x2,  ctlz_u16x2,  cttz_u16x2  }
-impl_bit_manip! { u16x4,  u16, i16x4, i16,  ctpop_u16x4,  ctlz_u16x4,  cttz_u16x4  }
-impl_bit_manip! { u16x8,  u16, i16x8, i16,  ctpop_u16x8,  ctlz_u16x8,  cttz_u16x8  }
-impl_bit_manip! { u16x16, u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 }
-impl_bit_manip! { u16x32, u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 }
-impl_bit_manip! { u32x2,  u32, i32x2, i32,  ctpop_u32x2,  ctlz_u32x2,  cttz_u32x2  }
-impl_bit_manip! { u32x4,  u32, i32x4, i32,  ctpop_u32x4,  ctlz_u32x4,  cttz_u32x4  }
-impl_bit_manip! { u32x8,  u32, i32x8, i32,  ctpop_u32x8,  ctlz_u32x8,  cttz_u32x8  }
-impl_bit_manip! { u32x16, u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 }
-impl_bit_manip! { u64x2,  u64, i64x2, i64,  ctpop_u64x2,  ctlz_u64x2,  cttz_u64x2  }
-impl_bit_manip! { u64x4,  u64, i64x4, i64,  ctpop_u64x4,  ctlz_u64x4,  cttz_u64x4  }
-impl_bit_manip! { u64x8,  u64, i64x8, i64,  ctpop_u64x8,  ctlz_u64x8,  cttz_u64x8  }
-impl_bit_manip! { u128x1, u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 }
-impl_bit_manip! { u128x2, u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 }
-impl_bit_manip! { u128x4, u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 }
+impl_bit_manip! { u8x8   ,   u8, i8x8, i8,   ctpop_u8x8,   ctlz_u8x8,   cttz_u8x8   }
+impl_bit_manip! { u8x16  ,  u8, i8x16, i8,  ctpop_u8x16,  ctlz_u8x16,  cttz_u8x16  }
+impl_bit_manip! { u8x32  ,  u8, i8x32, i8,  ctpop_u8x32,  ctlz_u8x32,  cttz_u8x32  }
+impl_bit_manip! { u8x64  ,  u8, i8x64, i8,  ctpop_u8x64,  ctlz_u8x64,  cttz_u8x64  }
+impl_bit_manip! { u16x2  ,  u16, i16x2, i16,  ctpop_u16x2,  ctlz_u16x2,  cttz_u16x2  }
+impl_bit_manip! { u16x4  ,  u16, i16x4, i16,  ctpop_u16x4,  ctlz_u16x4,  cttz_u16x4  }
+impl_bit_manip! { u16x8  ,  u16, i16x8, i16,  ctpop_u16x8,  ctlz_u16x8,  cttz_u16x8  }
+impl_bit_manip! { u16x16 , u16, i16x16, i16, ctpop_u16x16, ctlz_u16x16, cttz_u16x16 }
+impl_bit_manip! { u16x32 , u16, i16x32, i16, ctpop_u16x32, ctlz_u16x32, cttz_u16x32 }
+impl_bit_manip! { u32x2  ,  u32, i32x2, i32,  ctpop_u32x2,  ctlz_u32x2,  cttz_u32x2  }
+impl_bit_manip! { u32x4  ,  u32, i32x4, i32,  ctpop_u32x4,  ctlz_u32x4,  cttz_u32x4  }
+impl_bit_manip! { u32x8  ,  u32, i32x8, i32,  ctpop_u32x8,  ctlz_u32x8,  cttz_u32x8  }
+impl_bit_manip! { u32x16 , u32, i32x16, i32, ctpop_u32x16, ctlz_u32x16, cttz_u32x16 }
+impl_bit_manip! { u64x2  ,  u64, i64x2, i64,  ctpop_u64x2,  ctlz_u64x2,  cttz_u64x2  }
+impl_bit_manip! { u64x4  ,  u64, i64x4, i64,  ctpop_u64x4,  ctlz_u64x4,  cttz_u64x4  }
+impl_bit_manip! { u64x8  ,  u64, i64x8, i64,  ctpop_u64x8,  ctlz_u64x8,  cttz_u64x8  }
+impl_bit_manip! { u128x1 , u128, i128x1, i128, ctpop_u128x1, ctlz_u128x1, cttz_u128x1 }
+impl_bit_manip! { u128x2 , u128, i128x2, i128, ctpop_u128x2, ctlz_u128x2, cttz_u128x2 }
+impl_bit_manip! { u128x4 , u128, i128x4, i128, ctpop_u128x4, ctlz_u128x4, cttz_u128x4 }
 
 #[cfg(target_arch = "aarch64")]
 impl BitManip for u8x8 {
@@ -285,7 +295,8 @@ impl BitManip for u8x8 {
     #[inline]
     fn cttz(self) -> Self {
         // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191
-        // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 intrinsics
+        // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64
+        // intrinsics
         let mut tz = self;
         for i in 0..Self::lanes() {
             tz = tz.replace(i, self.extract(i).trailing_zeros() as u8);
@@ -310,7 +321,8 @@ impl BitManip for i8x8 {
     #[inline]
     fn cttz(self) -> Self {
         // FIXME: LLVM cttz.v8i8 broken on aarch64 https://github.com/rust-lang-nursery/packed_simd/issues/191
-        // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64 intrinsics
+        // OPTIMIZE: adapt the algorithm used for v8i16/etc to Rust's aarch64
+        // intrinsics
         let mut tz = self;
         for i in 0..Self::lanes() {
             tz = tz.replace(i, self.extract(i).trailing_zeros() as i8);
diff --git a/src/codegen/reductions/mask.rs b/src/codegen/reductions/mask.rs
index 9e5e6e418..498817ad8 100644
--- a/src/codegen/reductions/mask.rs
+++ b/src/codegen/reductions/mask.rs
@@ -23,7 +23,7 @@ cfg_if! {
         #[macro_use]
         mod x86;
     } else if #[cfg(all(target_arch = "arm", target_feature = "v7",
-                        target_feature = "neon", feature = "coresimd"))] {
+                        target_feature = "neon", feature = "core_arch"))] {
         #[macro_use]
         mod arm;
     } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] {
diff --git a/src/codegen/reductions/mask/x86.rs b/src/codegen/reductions/mask/x86.rs
index 89f500b03..2ae4ed81c 100644
--- a/src/codegen/reductions/mask/x86.rs
+++ b/src/codegen/reductions/mask/x86.rs
@@ -1,9 +1,5 @@
 //! Mask reductions implementation for `x86` and `x86_64` targets
 
-#[cfg(target_feature = "mmx")]
-#[macro_use]
-mod mmx;
-
 #[cfg(target_feature = "sse")]
 #[macro_use]
 mod sse;
@@ -24,8 +20,8 @@ mod avx2;
 macro_rules! x86_m8x8_impl {
     ($id:ident) => {
         cfg_if! {
-            if #[cfg(all(target_arch = "x86_64", target_feature = "mmx"))] {
-                x86_m8x8_mmx_impl!($id);
+            if #[cfg(all(target_arch = "x86_64", target_feature = "sse"))] {
+                x86_m8x8_sse_impl!($id);
             } else {
                 fallback_impl!($id);
             }
diff --git a/src/codegen/reductions/mask/x86/mmx.rs b/src/codegen/reductions/mask/x86/mmx.rs
deleted file mode 100644
index 3109f6f7b..000000000
--- a/src/codegen/reductions/mask/x86/mmx.rs
+++ /dev/null
@@ -1,34 +0,0 @@
-//! Mask reductions implementation for `x86` and `x86_64` targets with `MMX`.
-#![allow(unused)]
-
-macro_rules! x86_m8x8_mmx_impl {
-    ($id:ident) => {
-        impl All for $id {
-            #[inline]
-            #[target_feature(enable = "mmx")]
-            unsafe fn all(self) -> bool {
-                #[cfg(target_arch = "x86")]
-                use crate::arch::x86::_mm_movemask_pi8;
-                #[cfg(target_arch = "x86_64")]
-                use crate::arch::x86_64::_mm_movemask_pi8;
-                // _mm_movemask_pi8(a) creates an 8bit mask containing the most
-                // significant bit of each byte of `a`. If all bits are set,
-                // then all 8 lanes of the mask are true.
-                _mm_movemask_pi8(crate::mem::transmute(self))
-                    == u8::max_value() as i32
-            }
-        }
-        impl Any for $id {
-            #[inline]
-            #[target_feature(enable = "mmx")]
-            unsafe fn any(self) -> bool {
-                #[cfg(target_arch = "x86")]
-                use crate::arch::x86::_mm_movemask_pi8;
-                #[cfg(target_arch = "x86_64")]
-                use crate::arch::x86_64::_mm_movemask_pi8;
-
-                _mm_movemask_pi8(crate::mem::transmute(self)) != 0
-            }
-        }
-    };
-}
diff --git a/src/codegen/reductions/mask/x86/sse.rs b/src/codegen/reductions/mask/x86/sse.rs
index eb1ef7fac..7482f9430 100644
--- a/src/codegen/reductions/mask/x86/sse.rs
+++ b/src/codegen/reductions/mask/x86/sse.rs
@@ -34,3 +34,35 @@ macro_rules! x86_m32x4_sse_impl {
         }
     };
 }
+
+macro_rules! x86_m8x8_sse_impl {
+    ($id:ident) => {
+        impl All for $id {
+            #[inline]
+            #[target_feature(enable = "sse")]
+            unsafe fn all(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use crate::arch::x86::_mm_movemask_pi8;
+                #[cfg(target_arch = "x86_64")]
+                use crate::arch::x86_64::_mm_movemask_pi8;
+                // _mm_movemask_pi8(a) creates an 8bit mask containing the most
+                // significant bit of each byte of `a`. If all bits are set,
+                // then all 8 lanes of the mask are true.
+                _mm_movemask_pi8(crate::mem::transmute(self))
+                    == u8::max_value() as i32
+            }
+        }
+        impl Any for $id {
+            #[inline]
+            #[target_feature(enable = "sse")]
+            unsafe fn any(self) -> bool {
+                #[cfg(target_arch = "x86")]
+                use crate::arch::x86::_mm_movemask_pi8;
+                #[cfg(target_arch = "x86_64")]
+                use crate::arch::x86_64::_mm_movemask_pi8;
+
+                _mm_movemask_pi8(crate::mem::transmute(self)) != 0
+            }
+        }
+    };
+}
diff --git a/src/codegen/shuffle1_dyn.rs b/src/codegen/shuffle1_dyn.rs
index d87fbdb85..adc856175 100644
--- a/src/codegen/shuffle1_dyn.rs
+++ b/src/codegen/shuffle1_dyn.rs
@@ -53,7 +53,7 @@ macro_rules! impl_shuffle1_dyn {
                     all(target_aarch = "aarch64", target_feature = "neon"),
                     all(target_aarch = "arm", target_feature = "v7",
                         target_feature = "neon")),
-                feature = "coresimd")
+                feature = "core_arch")
             )] {
                 impl Shuffle1Dyn for u8x8 {
                     type Indices = Self;
@@ -104,7 +104,7 @@ macro_rules! impl_shuffle1_dyn {
                     }
                 }
             } else if #[cfg(all(target_aarch = "aarch64", target_feature = "neon",
-                                feature = "coresimd"))] {
+                                feature = "core_arch"))] {
                 impl Shuffle1Dyn for u8x16 {
                     type Indices = Self;
                     #[inline]
@@ -123,7 +123,7 @@ macro_rules! impl_shuffle1_dyn {
                     }
                 }
             } else if #[cfg(all(target_aarch = "arm", target_feature = "v7",
-                                target_feature = "neon", feature = "coresimd"))] {
+                                target_feature = "neon", feature = "core_arch"))] {
                 impl Shuffle1Dyn for u8x16 {
                     type Indices = Self;
                     #[inline]
diff --git a/src/lib.rs b/src/lib.rs
index 3842d29c2..60867db6c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -125,10 +125,10 @@
 //! resulting vector contains the elements of `a` for those lanes for which the
 //! mask is `true`, and the elements of `b` otherwise.
 //!
-//! The example constructs a mask with the first two lanes set to `true` and the
-//! last two lanes set to `false`. This selects the first two lanes of `a + 1`
-//! and the last two lanes of `a`, producing a vector where the first two lanes
-//! have been incremented by `1`.
+//! The example constructs a mask with the first two lanes set to `true` and
+//! the last two lanes set to `false`. This selects the first two lanes of
+//! `a + 1` and the last two lanes of `a`, producing a vector where the first
+//! two lanes have been incremented by `1`.
 //!
 //! > note: mask `select` can be used on vector types that have the same number
 //! > of lanes as the mask. The example shows this by using [`m16x4`] instead
@@ -239,9 +239,9 @@ use cfg_if::cfg_if;
 
 cfg_if! {
     if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon",
-                 feature = "coresimd"))] {
+                 feature = "core_arch"))] {
         #[allow(unused_imports)]
-        use coresimd::arch;
+        use core_arch as arch;
     } else {
         #[allow(unused_imports)]
         use core::arch;
@@ -260,7 +260,6 @@ use core::{
 
 #[macro_use]
 mod testing;
-
 #[macro_use]
 mod api;
 mod codegen;
diff --git a/tests/endianness.rs b/tests/endianness.rs
index 2afd56173..1e6b4f354 100644
--- a/tests/endianness.rs
+++ b/tests/endianness.rs
@@ -224,9 +224,6 @@ fn endian_tuple_access() {
     assert_eq!(x.6, e[6]);
     assert_eq!(x.7, e[7]);
 
-    // Without repr(C) this produces total garbage.
-    // FIXME: investigate more, this is maybe due to
-    // to tuple field reordering to minimize padding.
     #[cfg_attr(rustfmt, rustfmt_skip)]
     #[repr(C)]
     #[derive(Copy ,Clone)]