From 47e324559d7a6cfa065f67e2026b8be2540da219 Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Thu, 4 Jun 2020 08:22:14 -0700
Subject: [PATCH 01/15] Update and revamp wasm32 SIMD intrinsics

Lots of time and lots of things have happened since the simd128 support
was first added to this crate. Things are starting to settle down now,
so this commit syncs the Rust intrinsic definitions with the current
specification (https://github.com/WebAssembly/simd). Unfortunately not
everything can be enabled just yet, but everything is in the pipeline
for getting enabled soon.

This commit also applies a major revamp to how intrinsics are tested.
The intention is that the setup should be much more lightweight and/or
easy to work with after this commit.

At a high level, the changes here are:

* Testing with node.js and `#[wasm_bindgen]` has been removed. Instead
  intrinsics are tested with Wasmtime, which has a nearly complete
  implementation of the SIMD spec (and soon fully complete!).

* Testing is switched to `wasm32-wasi` to make idiomatic Rust bits a
  bit easier to work with (e.g. `panic!`).

* Testing of this crate's simd128 feature for wasm is re-enabled. This
  will run on CI and both compile and execute intrinsics. This should
  bring wasm intrinsics to the same level of parity as x86 intrinsics,
  for example.

* New wasm intrinsics have been added:
  * `iNNxMM_loadAxA_{s,u}`
  * `vNNxMM_load_splat`
  * `v8x16_swizzle`
  * `v128_andnot`
  * `iNNxMM_abs`
  * `iNNxMM_narrow_*_{u,s}`
  * `iNNxMM_bitmask` - commented out until LLVM is updated to LLVM 11
  * `iNNxMM_widen_*_{u,s}` - commented out until
    bytecodealliance/wasmtime#1994 lands
  * `iNNxMM_{max,min}_{u,s}`
  * `iNNxMM_avgr_u`

* Some wasm intrinsics have been removed:
  * `i64x2_trunc_*`
  * `f64x2_convert_*`
  * `i8x16_mul`

* The `v8x16.shuffle` instruction is exposed. This is done through a
  `macro` (not `macro_rules!`, but `macro`). This is intended to be
  somewhat experimental and unstable until we decide otherwise. This
  instruction has 16 immediate-mode expressions and is, as a result,
  unsuited to the existing `constify_*` logic of this crate. I'm hoping
  that we can game out over time what a macro might look like and/or
  look for better solutions. For now, though, what's implemented is the
  first of its kind in this crate (an architecture-specific macro), so
  some extra scrutiny looking at it would be appreciated.

* Lots of `assert_instr` annotations have been fixed for wasm.

* All wasm simd128 tests are uncommented and passing now.

This is still missing tests for new intrinsics, and it's also missing
tests for various corner cases. I hope to get to those later as the
upstream spec itself gets closer to stabilization.

In the meantime, however, I went ahead and updated the `hex.rs` example
with a wasm implementation using intrinsics. With it I got some very
impressive speedups using Wasmtime:

    test benches::large_default  ... bench:   213,961 ns/iter (+/- 5,108) = 4900 MB/s
    test benches::large_fallback ... bench: 3,108,434 ns/iter (+/- 75,730) = 337 MB/s
    test benches::small_default  ... bench:        52 ns/iter (+/- 0) = 2250 MB/s
    test benches::small_fallback ... bench:       358 ns/iter (+/- 0) = 326 MB/s

In other words, with Wasmtime, hex encoding using SIMD is 15x faster on
1MB chunks and 7x faster on small <128-byte chunks.

All of these intrinsics are still unstable and will continue to be so,
presumably until the simd proposal in wasm itself progresses to a later
stage. Additionally we'll still want to sync with clang on intrinsic
names (or decide not to) at some point in the future.
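For reference, here is a minimal sketch (not part of this patch) of how
the revamped API is intended to be used. It assumes a nightly compiler,
a wasm32 target built with `-C target-feature=+simd128`, and the usual
re-export of this crate as `core::arch::wasm32`; the names mirror the
definitions in the diff below, but treat this as a sketch rather than
documented, stable API:

    // Illustrative only: exercising the new `v8x16_shuffle` macro and a
    // couple of the newly added intrinsics from this patch.
    use core::arch::wasm32::*;

    // Interleave the low 8 bytes of `a` and `b`. All 16 indices must be
    // constants: 0..=15 select lanes of `a`, 16..=31 select lanes of `b`.
    unsafe fn interleave_low(a: v128, b: v128) -> v128 {
        v8x16_shuffle!(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
    }

    // Load-and-splat a single byte, then clamp each lane of a full
    // 16-byte load to that value with the lane-wise unsigned minimum.
    unsafe fn clamp_bytes(data: *const v128, limit: *const u8) -> v128 {
        let v = v128_load(data);
        let max = v8x16_load_splat(limit);
        i8x16_min_u(v, max)
    }

The shuffle goes through a macro rather than a function precisely
because its 16 lane indices are immediates in the encoded instruction,
so they have to be constant expressions at every call site.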
--- .github/workflows/main.yml | 4 +- ci/docker/wasm32-unknown-unknown/Dockerfile | 25 - .../wasm32-unknown-unknown/wasm-entrypoint.sh | 15 - ci/docker/wasm32-wasi/Dockerfile | 22 + ci/run.sh | 31 +- crates/assert-instr-macro/src/lib.rs | 8 +- crates/core_arch/Cargo.toml | 3 - crates/core_arch/src/lib.rs | 9 +- crates/core_arch/src/mod.rs | 2 + crates/core_arch/src/wasm32/atomic.rs | 2 - crates/core_arch/src/wasm32/memory.rs | 2 - crates/core_arch/src/wasm32/mod.rs | 2 - crates/core_arch/src/wasm32/simd128.rs | 1690 ++++++++++------- crates/core_arch/tests/xcrate-macros.rs | 18 + crates/std_detect/src/detect/mod.rs | 2 + crates/std_detect/src/detect/os/other.rs | 1 + crates/stdarch-test/Cargo.toml | 6 +- crates/stdarch-test/src/lib.rs | 12 - crates/stdarch-test/src/wasm.rs | 48 +- examples/Cargo.toml | 6 +- examples/hex.rs | 85 +- examples/wasm.rs | 4 +- 22 files changed, 1200 insertions(+), 797 deletions(-) delete mode 100644 ci/docker/wasm32-unknown-unknown/Dockerfile delete mode 100755 ci/docker/wasm32-unknown-unknown/wasm-entrypoint.sh create mode 100644 ci/docker/wasm32-wasi/Dockerfile create mode 100644 crates/core_arch/tests/xcrate-macros.rs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c5ef15004c..cadfc38300 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,7 +77,7 @@ jobs: - mips64-unknown-linux-gnuabi64 - mips64el-unknown-linux-gnuabi64 - s390x-unknown-linux-gnu - - wasm32-unknown-unknown + - wasm32-wasi - i586-unknown-linux-gnu - x86_64-linux-android - arm-linux-androideabi @@ -129,7 +129,7 @@ jobs: disable_assert_instr: true - target: s390x-unknown-linux-gnu os: ubuntu-latest - - target: wasm32-unknown-unknown + - target: wasm32-wasi os: ubuntu-latest - target: aarch64-unknown-linux-gnu os: ubuntu-latest diff --git a/ci/docker/wasm32-unknown-unknown/Dockerfile b/ci/docker/wasm32-unknown-unknown/Dockerfile deleted file mode 100644 index 7b2567bcc7..0000000000 --- a/ci/docker/wasm32-unknown-unknown/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM ubuntu:18.04 - -RUN apt-get update -y && apt-get install -y --no-install-recommends \ - ca-certificates \ - clang \ - cmake \ - curl \ - git \ - libc6-dev \ - make \ - python \ - python3 \ - xz-utils - -# Install `wasm2wat` -RUN git clone --recursive https://github.com/WebAssembly/wabt -RUN make -C wabt -j$(nproc) -ENV PATH=$PATH:/wabt/bin - -# Install `node` -RUN curl https://nodejs.org/dist/v12.0.0/node-v12.0.0-linux-x64.tar.xz | tar xJf - -ENV PATH=$PATH:/node-v12.0.0-linux-x64/bin - -COPY docker/wasm32-unknown-unknown/wasm-entrypoint.sh /wasm-entrypoint.sh -ENTRYPOINT ["/wasm-entrypoint.sh"] diff --git a/ci/docker/wasm32-unknown-unknown/wasm-entrypoint.sh b/ci/docker/wasm32-unknown-unknown/wasm-entrypoint.sh deleted file mode 100755 index 9916d1cb22..0000000000 --- a/ci/docker/wasm32-unknown-unknown/wasm-entrypoint.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -e - -# Download an appropriate version of wasm-bindgen based off of what's being used -# in the lock file. Ideally we'd use `wasm-pack` at some point for this! 
-version=$(grep -A 1 'name = "wasm-bindgen"' Cargo.lock | grep version) -version=$(echo $version | awk '{print $3}' | sed 's/"//g') -curl -L https://github.com/rustwasm/wasm-bindgen/releases/download/$version/wasm-bindgen-$version-x86_64-unknown-linux-musl.tar.gz \ - | tar xzf - -C target -export PATH=$PATH:`pwd`/target/wasm-bindgen-$version-x86_64-unknown-linux-musl -export CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=wasm-bindgen-test-runner -export NODE_ARGS=--experimental-wasm-simd - -exec "$@" diff --git a/ci/docker/wasm32-wasi/Dockerfile b/ci/docker/wasm32-wasi/Dockerfile new file mode 100644 index 0000000000..7b7d75190c --- /dev/null +++ b/ci/docker/wasm32-wasi/Dockerfile @@ -0,0 +1,22 @@ +FROM ubuntu:20.04 + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -y && apt-get install -y --no-install-recommends \ + ca-certificates \ + clang \ + cmake \ + curl \ + git \ + libc6-dev \ + make \ + python \ + python3 \ + xz-utils + +RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/dev/wasmtime-dev-x86_64-linux.tar.xz | tar xJf - +ENV PATH=$PATH:/wasmtime-dev-x86_64-linux + +ENV CARGO_TARGET_WASM32_WASI_RUNNER="wasmtime \ + --enable-simd \ + --mapdir .::/checkout/target/wasm32-wasi/release/deps \ + --" diff --git a/ci/run.sh b/ci/run.sh index 682a38636c..2b7e51be3d 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -44,6 +44,16 @@ cargo_test() { fi cmd="$cmd ${subcmd} --target=$TARGET $1" cmd="$cmd -- $2" + + # wasm targets can't catch panics so if a test failures make sure the test + # harness isn't trying to capture output, otherwise we won't get any useful + # output. + case ${TARGET} in + wasm32*) + cmd="$cmd --nocapture" + ;; + esac + $cmd } @@ -72,20 +82,11 @@ case ${TARGET} in export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx" cargo_test "--release" ;; - wasm32-unknown-unknown*) - # Attempt to actually run some SIMD tests in node.js. Unfortunately - # though node.js (transitively through v8) doesn't have support for the - # full SIMD spec yet, only some functions. As a result only pass in - # some target features and a special `--cfg` - # FIXME: broken - #export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+simd128 --cfg only_node_compatible_functions" - #cargo_test "--release" - - # After that passes make sure that all intrinsics compile, passing in - # the extra feature to compile in non-node-compatible SIMD. - # FIXME: broken - #export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+simd128,+unimplemented-simd128" - #cargo_test "--release --no-run" + wasm32*) + prev="$RUSTFLAGS" + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+simd128,+unimplemented-simd128" + cargo_test "--release" + export RUSTFLAGS="$prev" ;; # FIXME: don't build anymore #mips-*gnu* | mipsel-*gnu*) @@ -111,7 +112,7 @@ case ${TARGET} in esac -if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ] && [ "$TARGET" != "wasm32-unknown-unknown" ]; then +if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ]; then # Test examples ( cd examples diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs index 75fe9851ca..200f02fae5 100644 --- a/crates/assert-instr-macro/src/lib.rs +++ b/crates/assert-instr-macro/src/lib.rs @@ -131,8 +131,7 @@ pub fn assert_instr( }; let tokens: TokenStream = quote! { - #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] - #[cfg_attr(not(target_arch = "wasm32"), test)] + #[test] #[allow(non_snake_case)] fn #assert_name() { #to_test @@ -146,11 +145,6 @@ pub fn assert_instr( #instr); } }; - // why? necessary now to get tests to work? 
- let tokens: TokenStream = tokens - .to_string() - .parse() - .expect("cannot parse tokenstream"); let tokens: TokenStream = quote! { #item diff --git a/crates/core_arch/Cargo.toml b/crates/core_arch/Cargo.toml index 72d89b0168..a25b20bf0c 100644 --- a/crates/core_arch/Cargo.toml +++ b/crates/core_arch/Cargo.toml @@ -26,8 +26,5 @@ maintenance = { status = "experimental" } stdarch-test = { version = "0.*", path = "../stdarch-test" } std_detect = { version = "0.*", path = "../std_detect" } -[target.wasm32-unknown-unknown.dev-dependencies] -wasm-bindgen-test = "0.2.47" - [package.metadata.docs.rs] rustdoc-args = [ "--cfg", "dox" ] diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index fae4519a0e..97c82b5234 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -32,7 +32,9 @@ adx_target_feature, rtm_target_feature, f16c_target_feature, - external_doc + external_doc, + allow_internal_unstable, + decl_macro )] #![cfg_attr(test, feature(test, abi_vectorcall, untagged_unions))] #![deny(clippy::missing_inline_in_public_items)] @@ -66,13 +68,10 @@ extern crate std_detect; #[cfg(test)] extern crate stdarch_test; -#[cfg(all(test, target_arch = "wasm32"))] -extern crate wasm_bindgen_test; - #[path = "mod.rs"] mod core_arch; -pub use self::core_arch::arch::*; +pub use self::core_arch::arch; #[allow(unused_imports)] use core::{ffi, hint, intrinsics, marker, mem, ops, ptr, sync}; diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs index 4ed18d7648..19f61affdd 100644 --- a/crates/core_arch/src/mod.rs +++ b/crates/core_arch/src/mod.rs @@ -1,5 +1,7 @@ //! `core_arch` +#![allow(improper_ctypes_definitions)] + #[macro_use] mod macros; diff --git a/crates/core_arch/src/wasm32/atomic.rs b/crates/core_arch/src/wasm32/atomic.rs index b8ffaeac0e..024bf94a7f 100644 --- a/crates/core_arch/src/wasm32/atomic.rs +++ b/crates/core_arch/src/wasm32/atomic.rs @@ -10,8 +10,6 @@ #[cfg(test)] use stdarch_test::assert_instr; -#[cfg(test)] -use wasm_bindgen_test::wasm_bindgen_test; extern "C" { #[link_name = "llvm.wasm.atomic.wait.i32"] diff --git a/crates/core_arch/src/wasm32/memory.rs b/crates/core_arch/src/wasm32/memory.rs index 3df8abdee2..c4e801b738 100644 --- a/crates/core_arch/src/wasm32/memory.rs +++ b/crates/core_arch/src/wasm32/memory.rs @@ -1,7 +1,5 @@ #[cfg(test)] use stdarch_test::assert_instr; -#[cfg(test)] -use wasm_bindgen_test::wasm_bindgen_test; extern "C" { #[link_name = "llvm.wasm.memory.grow.i32"] diff --git a/crates/core_arch/src/wasm32/mod.rs b/crates/core_arch/src/wasm32/mod.rs index 5e7a9d85f4..a8becb64ad 100644 --- a/crates/core_arch/src/wasm32/mod.rs +++ b/crates/core_arch/src/wasm32/mod.rs @@ -2,8 +2,6 @@ #[cfg(test)] use stdarch_test::assert_instr; -#[cfg(test)] -use wasm_bindgen_test::wasm_bindgen_test; #[cfg(any(target_feature = "atomics", dox))] mod atomic; diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index 5ac01a4fae..6c7599f4a2 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -4,6 +4,7 @@ //! https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md #![allow(non_camel_case_types)] +#![allow(unused_imports)] use crate::{ core_arch::{simd::*, simd_llvm::*}, @@ -14,8 +15,6 @@ use crate::{ #[cfg(test)] use stdarch_test::assert_instr; -#[cfg(test)] -use wasm_bindgen_test::wasm_bindgen_test; types! { /// WASM-specific 128-bit wide SIMD vector type. 
@@ -119,11 +118,6 @@ extern "C" { #[link_name = "llvm.wasm.alltrue.v4i32"] fn llvm_i32x4_all_true(x: i32x4) -> i32; - #[link_name = "llvm.wasm.anytrue.v2i64"] - fn llvm_i64x2_any_true(x: i64x2) -> i32; - #[link_name = "llvm.wasm.alltrue.v2i64"] - fn llvm_i64x2_all_true(x: i64x2) -> i32; - #[link_name = "llvm.fabs.v4f32"] fn llvm_f32x4_abs(x: f32x4) -> f32x4; #[link_name = "llvm.sqrt.v4f32"] @@ -143,6 +137,47 @@ extern "C" { #[link_name = "llvm.wasm.bitselect.v16i8"] fn llvm_bitselect(a: i8x16, b: i8x16, c: i8x16) -> i8x16; + #[link_name = "llvm.wasm.swizzle"] + fn llvm_swizzle(a: i8x16, b: i8x16) -> i8x16; + + #[link_name = "llvm.wasm.bitmask.v16i8"] + fn llvm_bitmask_i8x16(a: i8x16) -> i32; + #[link_name = "llvm.wasm.narrow.signed.v16i8.v8i16"] + fn llvm_narrow_i8x16_s(a: i16x8, b: i16x8) -> i8x16; + #[link_name = "llvm.wasm.narrow.unsigned.v16i8.v8i16"] + fn llvm_narrow_i8x16_u(a: i16x8, b: i16x8) -> i8x16; + #[link_name = "llvm.wasm.avgr.unsigned.v16i8"] + fn llvm_avgr_u_i8x16(a: i8x16, b: i8x16) -> i8x16; + + #[link_name = "llvm.wasm.bitmask.v8i16"] + fn llvm_bitmask_i16x8(a: i16x8) -> i32; + #[link_name = "llvm.wasm.narrow.signed.v8i16.v8i16"] + fn llvm_narrow_i16x8_s(a: i32x4, b: i32x4) -> i16x8; + #[link_name = "llvm.wasm.narrow.unsigned.v8i16.v8i16"] + fn llvm_narrow_i16x8_u(a: i32x4, b: i32x4) -> i16x8; + #[link_name = "llvm.wasm.avgr.unsigned.v8i16"] + fn llvm_avgr_u_i16x8(a: i16x8, b: i16x8) -> i16x8; + #[link_name = "llvm.wasm.widen.low.signed.v8i16.v16i8"] + fn llvm_widen_low_i16x8_s(a: i8x16) -> i16x8; + #[link_name = "llvm.wasm.widen.high.signed.v8i16.v16i8"] + fn llvm_widen_high_i16x8_s(a: i8x16) -> i16x8; + #[link_name = "llvm.wasm.widen.low.unsigned.v8i16.v16i8"] + fn llvm_widen_low_i16x8_u(a: i8x16) -> i16x8; + #[link_name = "llvm.wasm.widen.high.unsigned.v8i16.v16i8"] + fn llvm_widen_high_i16x8_u(a: i8x16) -> i16x8; + + #[link_name = "llvm.wasm.bitmask.v4i32"] + fn llvm_bitmask_i32x4(a: i32x4) -> i32; + #[link_name = "llvm.wasm.avgr.unsigned.v4i32"] + fn llvm_avgr_u_i32x4(a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.wasm.widen.low.signed.v4i32.v8i16"] + fn llvm_widen_low_i32x4_s(a: i16x8) -> i32x4; + #[link_name = "llvm.wasm.widen.high.signed.v4i32.v8i16"] + fn llvm_widen_high_i32x4_s(a: i16x8) -> i32x4; + #[link_name = "llvm.wasm.widen.low.unsigned.v4i32.v8i16"] + fn llvm_widen_low_i32x4_u(a: i16x8) -> i32x4; + #[link_name = "llvm.wasm.widen.high.unsigned.v4i32.v8i16"] + fn llvm_widen_high_i32x4_u(a: i16x8) -> i32x4; } /// Loads a `v128` vector from the given heap address. 
@@ -152,6 +187,80 @@ pub unsafe fn v128_load(m: *const v128) -> v128 { ptr::read(m) } +/// Load eight 8-bit integers and sign extend each one to a 16-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i16x8.load8x8_s))] +pub unsafe fn i16x8_load8x8_s(m: *const i8) -> v128 { + transmute(simd_cast::<_, i16x8>(ptr::read(m as *const i8x8))) +} + +/// Load eight 8-bit integers and zero extend each one to a 16-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i16x8.load8x8_u))] +pub unsafe fn i16x8_load8x8_u(m: *const u8) -> v128 { + transmute(simd_cast::<_, u16x8>(ptr::read(m as *const u8x8))) +} + +/// Load four 16-bit integers and sign extend each one to a 32-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i32x4.load16x4_s))] +pub unsafe fn i32x4_load16x4_s(m: *const i16) -> v128 { + transmute(simd_cast::<_, i32x4>(ptr::read(m as *const i16x4))) +} + +/// Load four 16-bit integers and zero extend each one to a 32-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i32x4.load16x4_u))] +pub unsafe fn i32x4_load16x4_u(m: *const u16) -> v128 { + transmute(simd_cast::<_, u32x4>(ptr::read(m as *const u16x4))) +} + +/// Load two 32-bit integers and sign extend each one to a 64-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i64x2.load32x2_s))] +pub unsafe fn i64x2_load32x2_s(m: *const i32) -> v128 { + transmute(simd_cast::<_, i64x2>(ptr::read(m as *const i32x2))) +} + +/// Load two 32-bit integers and zero extend each one to a 64-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i64x2.load32x2_u))] +pub unsafe fn i64x2_load32x2_u(m: *const u32) -> v128 { + transmute(simd_cast::<_, u64x2>(ptr::read(m as *const u32x2))) +} + +/// Load a single element and splat to all lanes of a v128 vector. +#[inline] +#[cfg_attr(test, assert_instr(v8x16.load_splat))] +pub unsafe fn v8x16_load_splat(m: *const u8) -> v128 { + let v = *m; + transmute(u8x16(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)) +} + +/// Load a single element and splat to all lanes of a v128 vector. +#[inline] +#[cfg_attr(test, assert_instr(v16x8.load_splat))] +pub unsafe fn v16x8_load_splat(m: *const u16) -> v128 { + let v = *m; + transmute(u16x8(v, v, v, v, v, v, v, v)) +} + +/// Load a single element and splat to all lanes of a v128 vector. +#[inline] +#[cfg_attr(test, assert_instr(v32x4.load_splat))] +pub unsafe fn v32x4_load_splat(m: *const u32) -> v128 { + let v = *m; + transmute(u32x4(v, v, v, v)) +} + +/// Load a single element and splat to all lanes of a v128 vector. +#[inline] +#[cfg_attr(test, assert_instr(v64x2.load_splat))] +pub unsafe fn v64x2_load_splat(m: *const u64) -> v128 { + let v = *m; + transmute(u64x2(v, v)) +} + /// Stores a `v128` vector to the given heap address. #[inline] #[cfg_attr(test, assert_instr(v128.store))] @@ -164,7 +273,6 @@ pub unsafe fn v128_store(m: *mut v128, a: v128) { /// The `v128.const` instruction is encoded with 16 immediate bytes /// `imm` which provide the bits of the vector directly. #[inline] -#[cfg(not(only_node_compatible_functions))] #[rustc_args_required_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)] #[cfg_attr(test, assert_instr( v128.const, @@ -217,6 +325,103 @@ pub const fn v128_const( } } +/// Returns a new vector with lanes selected from the lanes of the two input +/// vectors `$a` and `$b` specified in the 16 immediate operands. +/// +/// The `$a` and `$b` expressions must have type `v128`, and this macro +/// generates a wasm instruction that is encoded with 16 bytes providing the +/// indices of the elements to return. 
The indices `i` in range [0, 15] select +/// the `i`-th element of `a`. The indices in range [16, 31] select the `i - +/// 16`-th element of `b`. +/// +/// Note that this is a macro due to the codegen requirements of all of the +/// index expressions `$i*` must be constant. A compiler error will be +/// generated if any of the expressions are not constant. +/// +/// All indexes `$i*` must have the type `u32`. +#[allow_internal_unstable(platform_intrinsics, rustc_attrs)] +pub macro v8x16_shuffle( + $a:expr, $b:expr, + $i0:expr, + $i1:expr, + $i2:expr, + $i3:expr, + $i4:expr, + $i5:expr, + $i6:expr, + $i7:expr, + $i8:expr, + $i9:expr, + $i10:expr, + $i11:expr, + $i12:expr, + $i13:expr, + $i14:expr, + $i15:expr $(,)? +) {{ + extern "platform-intrinsic" { + #[rustc_args_required_const(2)] + pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U; + } + let shuf = simd_shuffle16::< + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + >( + $crate::arch::wasm32::__v8x16_shuffle_transmute::< + $crate::arch::wasm32::v128, + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + >($a), + $crate::arch::wasm32::__v8x16_shuffle_transmute::< + $crate::arch::wasm32::v128, + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + >($b), + [ + $i0, $i1, $i2, $i3, $i4, $i5, $i6, $i7, $i8, $i9, $i10, $i11, $i12, $i13, $i14, $i15, + ], + ); + $crate::arch::wasm32::__v8x16_shuffle_transmute::< + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + $crate::arch::wasm32::v128, + >(shuf) +}} + +// internal implementation detail of the `v8x16_shuffle`, done so there's a name +// that always works for the macro to import. +pub use crate::mem::transmute as __v8x16_shuffle_transmute; + +// internal to this module and only generated here as an implementation detail +// of the `v8x16_shuffle` macro. +#[repr(simd)] +pub struct __v8x16_shuffle_u8x16( + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, +); + +/// Returns a new vector with lanes selected from the lanes of the first input +/// vector `a` specified in the second input vector `s`. +/// +/// The indices `i` in range [0, 15] select the `i`-th element of `a`. For +/// indices outside of the range the resulting lane is 0. +#[inline] +#[cfg_attr(test, assert_instr(v8x16.swizzle))] +pub fn v8x16_swizzle(a: v128, s: v128) -> v128 { + unsafe { transmute(llvm_swizzle(transmute(a), transmute(s))) } +} + /// Creates a vector with identical lanes. /// /// Constructs a vector with `x` replicated to all 16 lanes. @@ -226,6 +431,51 @@ pub fn i8x16_splat(a: i8) -> v128 { unsafe { transmute(i8x16::splat(a)) } } +/// Creates a vector with identical lanes. +/// +/// Construct a vector with `x` replicated to all 8 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.splat))] +pub fn i16x8_splat(a: i16) -> v128 { + unsafe { transmute(i16x8::splat(a)) } +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 4 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.splat))] +pub fn i32x4_splat(a: i32) -> v128 { + unsafe { transmute(i32x4::splat(a)) } +} + +/// Creates a vector with identical lanes. +/// +/// Construct a vector with `x` replicated to all 2 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.splat))] +pub fn i64x2_splat(a: i64) -> v128 { + unsafe { transmute(i64x2::splat(a)) } +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 4 lanes. 
+#[inline] +#[cfg_attr(test, assert_instr(f32x4.splat))] +pub fn f32x4_splat(a: f32) -> v128 { + unsafe { transmute(f32x4::splat(a)) } +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 2 lanes. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.splat))] +pub fn f64x2_splat(a: f64) -> v128 { + unsafe { transmute(f64x2::splat(a)) } +} + /// Extracts a lane from a 128-bit vector interpreted as 16 packed i8 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -238,20 +488,21 @@ pub fn i8x16_splat(a: i8) -> v128 { #[inline] #[rustc_args_required_const(1)] pub unsafe fn i8x16_extract_lane(a: v128, imm: usize) -> i8 { - #[cfg(test)] - #[assert_instr(i8x16.extract_lane_s)] - fn extract_lane_s(a: v128) -> i32 { - unsafe { i8x16_extract_lane(a, 0) as i32 } - } - #[cfg(test)] - #[cfg(not(only_node_compatible_functions))] - #[assert_instr(i8x16.extract_lane_u)] - fn extract_lane_u(a: v128) -> u32 { - unsafe { i8x16_extract_lane(a, 0) as u32 } - } simd_extract(a.as_i8x16(), imm as u32) } +#[cfg(test)] +#[assert_instr(i8x16.extract_lane_s)] +fn i8x16_extract_lane_s(a: v128) -> i32 { + unsafe { i8x16_extract_lane(a, 0) as i32 } +} + +#[cfg(test)] +#[assert_instr(i8x16.extract_lane_u)] +fn i8x16_extract_lane_u(a: v128) -> u32 { + unsafe { i8x16_extract_lane(a, 0) as u8 as u32 } +} + /// Replaces a lane from a 128-bit vector interpreted as 16 packed i8 numbers. /// /// Replaces the scalar value of lane specified in the immediate mode operand @@ -268,15 +519,6 @@ pub unsafe fn i8x16_replace_lane(a: v128, imm: usize, val: i8) -> v128 { transmute(simd_insert(a.as_i8x16(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Construct a vector with `x` replicated to all 8 lanes. -#[inline] -#[cfg_attr(test, assert_instr(i16x8.splat))] -pub fn i16x8_splat(a: i16) -> v128 { - unsafe { transmute(i16x8::splat(a)) } -} - /// Extracts a lane from a 128-bit vector interpreted as 8 packed i16 numbers. /// /// Extracts a the scalar value of lane specified in the immediate mode operand @@ -289,20 +531,21 @@ pub fn i16x8_splat(a: i16) -> v128 { #[inline] #[rustc_args_required_const(1)] pub unsafe fn i16x8_extract_lane(a: v128, imm: usize) -> i16 { - #[cfg(test)] - #[assert_instr(i16x8.extract_lane_s)] - fn extract_lane_s(a: v128) -> i32 { - unsafe { i16x8_extract_lane(a, 0) as i32 } - } - #[cfg(test)] - #[cfg(not(only_node_compatible_functions))] - #[assert_instr(i16x8.extract_lane_u)] - fn extract_lane_u(a: v128) -> u32 { - unsafe { i16x8_extract_lane(a, 0) as u32 } - } simd_extract(a.as_i16x8(), imm as u32) } +#[cfg(test)] +#[assert_instr(i16x8.extract_lane_s)] +fn i16x8_extract_lane_s(a: v128) -> i32 { + unsafe { i16x8_extract_lane(a, 0) as i32 } +} + +#[cfg(test)] +#[assert_instr(i16x8.extract_lane_u)] +fn i16x8_extract_lane_u(a: v128) -> u32 { + unsafe { i16x8_extract_lane(a, 0) as u16 as u32 } +} + /// Replaces a lane from a 128-bit vector interpreted as 8 packed i16 numbers. /// /// Replaces the scalar value of lane specified in the immediate mode operand @@ -319,15 +562,6 @@ pub unsafe fn i16x8_replace_lane(a: v128, imm: usize, val: i16) -> v128 { transmute(simd_insert(a.as_i16x8(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 4 lanes. 
-#[inline] -#[cfg_attr(test, assert_instr(i32x4.splat))] -pub fn i32x4_splat(a: i32) -> v128 { - unsafe { transmute(i32x4::splat(a)) } -} - /// Extracts a lane from a 128-bit vector interpreted as 4 packed i32 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -360,16 +594,6 @@ pub unsafe fn i32x4_replace_lane(a: v128, imm: usize, val: i32) -> v128 { transmute(simd_insert(a.as_i32x4(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Construct a vector with `x` replicated to all 2 lanes. -#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i8x16.splat))] -pub fn i64x2_splat(a: i64) -> v128 { - unsafe { transmute(i64x2::splat(a)) } -} - /// Extracts a lane from a 128-bit vector interpreted as 2 packed i64 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -380,8 +604,7 @@ pub fn i64x2_splat(a: i64) -> v128 { /// This function has undefined behavior if `imm` is greater than or equal to /// 2. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.extract_lane_s, imm = 0))] +#[cfg_attr(test, assert_instr(i64x2.extract_lane, imm = 0))] #[rustc_args_required_const(1)] pub unsafe fn i64x2_extract_lane(a: v128, imm: usize) -> i64 { simd_extract(a.as_i64x2(), imm as u32) @@ -397,22 +620,12 @@ pub unsafe fn i64x2_extract_lane(a: v128, imm: usize) -> i64 { /// This function has undefined behavior if `imm` is greater than or equal to /// 2. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i64x2.replace_lane, imm = 0))] #[rustc_args_required_const(1)] pub unsafe fn i64x2_replace_lane(a: v128, imm: usize, val: i64) -> v128 { transmute(simd_insert(a.as_i64x2(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 4 lanes. -#[inline] -#[cfg_attr(test, assert_instr(f32x4.splat))] -pub fn f32x4_splat(a: f32) -> v128 { - unsafe { transmute(f32x4::splat(a)) } -} - /// Extracts a lane from a 128-bit vector interpreted as 4 packed f32 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -445,16 +658,6 @@ pub unsafe fn f32x4_replace_lane(a: v128, imm: usize, val: f32) -> v128 { transmute(simd_insert(a.as_f32x4(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 2 lanes. -#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(f64x2.splat))] -pub fn f64x2_splat(a: f64) -> v128 { - unsafe { transmute(f64x2::splat(a)) } -} - /// Extracts lane from a 128-bit vector interpreted as 2 packed f64 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -465,8 +668,7 @@ pub fn f64x2_splat(a: f64) -> v128 { /// This function has undefined behavior if `imm` is greater than or equal to /// 2. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(f64x2.extract_lane_s, imm = 0))] +#[cfg_attr(test, assert_instr(f64x2.extract_lane, imm = 0))] #[rustc_args_required_const(1)] pub unsafe fn f64x2_extract_lane(a: v128, imm: usize) -> f64 { simd_extract(a.as_f64x2(), imm as u32) @@ -482,7 +684,6 @@ pub unsafe fn f64x2_extract_lane(a: v128, imm: usize) -> f64 { /// This function has undefined behavior if `imm` is greater than or equal to /// 2. 
#[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.replace_lane, imm = 0))] #[rustc_args_required_const(1)] pub unsafe fn f64x2_replace_lane(a: v128, imm: usize, val: f64) -> v128 { @@ -891,7 +1092,6 @@ pub fn f32x4_ge(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise elements /// were equal, or all zeros if the elements were not equal. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.eq))] pub fn f64x2_eq(a: v128, b: v128) -> v128 { unsafe { transmute(simd_eq::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -903,7 +1103,6 @@ pub fn f64x2_eq(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise elements /// were not equal, or all zeros if the elements were equal. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.ne))] pub fn f64x2_ne(a: v128, b: v128) -> v128 { unsafe { transmute(simd_ne::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -915,7 +1114,6 @@ pub fn f64x2_ne(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise left /// element is less than the pairwise right element, or all zeros otherwise. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.lt))] pub fn f64x2_lt(a: v128, b: v128) -> v128 { unsafe { transmute(simd_lt::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -927,7 +1125,6 @@ pub fn f64x2_lt(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise left /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.gt))] pub fn f64x2_gt(a: v128, b: v128) -> v128 { unsafe { transmute(simd_gt::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -939,7 +1136,6 @@ pub fn f64x2_gt(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise left /// element is less than the pairwise right element, or all zeros otherwise. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.le))] pub fn f64x2_le(a: v128, b: v128) -> v128 { unsafe { transmute(simd_le::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -951,7 +1147,6 @@ pub fn f64x2_le(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise left /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.ge))] pub fn f64x2_ge(a: v128, b: v128) -> v128 { unsafe { transmute(simd_ge::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -972,6 +1167,20 @@ pub fn v128_and(a: v128, b: v128) -> v128 { unsafe { transmute(simd_and(a.as_i64x2(), b.as_i64x2())) } } +/// Bitwise AND of bits of `a` and the logical inverse of bits of `b`. +/// +/// This operation is equivalent to `v128.and(a, v128.not(b))` +#[inline] +#[cfg_attr(test, assert_instr(v128.andnot))] +pub fn v128_andnot(a: v128, b: v128) -> v128 { + unsafe { + transmute(simd_and( + a.as_i64x2(), + simd_xor(b.as_i64x2(), i64x2(-1, -1)), + )) + } +} + /// Performs a bitwise or of the two input 128-bit vectors, returning the /// resulting vector. 
#[inline] @@ -992,7 +1201,22 @@ pub fn v128_xor(a: v128, b: v128) -> v128 { #[inline] #[cfg_attr(test, assert_instr(v128.bitselect))] pub fn v128_bitselect(v1: v128, v2: v128, c: v128) -> v128 { - unsafe { transmute(llvm_bitselect(c.as_i8x16(), v1.as_i8x16(), v2.as_i8x16())) } + unsafe { transmute(llvm_bitselect(v1.as_i8x16(), v2.as_i8x16(), c.as_i8x16())) } +} + +/// Lane-wise wrapping absolute value. +#[inline] +// #[cfg_attr(test, assert_instr(i8x16.abs))] // FIXME support not in our LLVM yet +pub fn i8x16_abs(a: v128) -> v128 { + unsafe { + let a = transmute::<_, i8x16>(a); + let zero = i8x16::splat(0); + transmute(simd_select::( + simd_lt(a, zero), + simd_sub(zero, a), + a, + )) + } } /// Negates a 128-bit vectors intepreted as sixteen 8-bit signed integers @@ -1016,12 +1240,42 @@ pub fn i8x16_all_true(a: v128) -> i32 { unsafe { llvm_i8x16_all_true(a.as_i8x16()) } } +// FIXME: not available in our LLVM yet +// /// Extracts the high bit for each lane in `a` and produce a scalar mask with +// /// all bits concatenated. +// #[inline] +// #[cfg_attr(test, assert_instr(i8x16.all_true))] +// pub fn i8x16_bitmask(a: v128) -> i32 { +// unsafe { llvm_bitmask_i8x16(transmute(a)) } +// } + +/// Converts two input vectors into a smaller lane vector by narrowing each +/// lane. +/// +/// Signed saturation to 0x7f or 0x80 is used and the input lanes are always +/// interpreted as signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_s))] +pub fn i8x16_narrow_i16x8_s(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_narrow_i8x16_s(transmute(a), transmute(b))) } +} + +/// Converts two input vectors into a smaller lane vector by narrowing each +/// lane. +/// +/// Signed saturation to 0x00 or 0xff is used and the input lanes are always +/// interpreted as signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_u))] +pub fn i8x16_narrow_i16x8_u(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_narrow_i8x16_u(transmute(a), transmute(b))) } +} + /// Shifts each lane to the left by the specified number of bits. /// /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i8x16.shl))] pub fn i8x16_shl(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shl(a.as_i8x16(), i8x16::splat(amt as i8))) } @@ -1033,8 +1287,7 @@ pub fn i8x16_shl(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i8x16.shl))] +#[cfg_attr(test, assert_instr(i8x16.shr_s))] pub fn i8x16_shr_s(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_i8x16(), i8x16::splat(amt as i8))) } } @@ -1045,8 +1298,7 @@ pub fn i8x16_shr_s(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i8x16.shl))] +#[cfg_attr(test, assert_instr(i8x16.shr_u))] pub fn i8x16_shr_u(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_u8x16(), u8x16::splat(amt as u8))) } } @@ -1097,12 +1349,74 @@ pub fn i8x16_sub_saturate_u(a: v128, b: v128) -> v128 { unsafe { transmute(llvm_i8x16_sub_saturate_u(a.as_i8x16(), b.as_i8x16())) } } -/// Multiplies two 128-bit vectors as if they were two packed sixteen 8-bit -/// signed integers. 
+/// Compares lane-wise signed integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.min_s))] +pub fn i8x16_min_s(a: v128, b: v128) -> v128 { + unsafe { + let a = a.as_i8x16(); + let b = b.as_i8x16(); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.min_u))] +pub fn i8x16_min_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u8x16>(a); + let b = transmute::<_, u8x16>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Compares lane-wise signed integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.max_s))] +pub fn i8x16_max_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i8x16>(a); + let b = transmute::<_, i8x16>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.max_u))] +pub fn i8x16_max_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u8x16>(a); + let b = transmute::<_, u8x16>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) + } +} + +/// Lane-wise rounding average. #[inline] -#[cfg_attr(test, assert_instr(i8x16.mul))] -pub fn i8x16_mul(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i8x16(), b.as_i8x16())) } +#[cfg_attr(test, assert_instr(i8x16.avgr_u))] +pub fn i8x16_avgr_u(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_avgr_u_i8x16(transmute(a), transmute(b))) } +} + +/// Lane-wise wrapping absolute value. +#[inline] +// #[cfg_attr(test, assert_instr(i16x8.abs))] // FIXME support not in our LLVM yet +pub fn i16x8_abs(a: v128) -> v128 { + unsafe { + let a = transmute::<_, i16x8>(a); + let zero = i16x8::splat(0); + transmute(simd_select::( + simd_lt(a, zero), + simd_sub(zero, a), + a, + )) + } } /// Negates a 128-bit vectors intepreted as eight 16-bit signed integers @@ -1126,12 +1440,75 @@ pub fn i16x8_all_true(a: v128) -> i32 { unsafe { llvm_i16x8_all_true(a.as_i16x8()) } } +// FIXME: not available in our LLVM yet +// /// Extracts the high bit for each lane in `a` and produce a scalar mask with +// /// all bits concatenated. +// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.all_true))] +// pub fn i16x8_bitmask(a: v128) -> i32 { +// unsafe { llvm_bitmask_i16x8(transmute(a)) } +// } + +/// Converts two input vectors into a smaller lane vector by narrowing each +/// lane. +/// +/// Signed saturation to 0x7fff or 0x8000 is used and the input lanes are always +/// interpreted as signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_s))] +pub fn i16x8_narrow_i32x4_s(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_narrow_i16x8_s(transmute(a), transmute(b))) } +} + +/// Converts two input vectors into a smaller lane vector by narrowing each +/// lane. +/// +/// Signed saturation to 0x0000 or 0xffff is used and the input lanes are always +/// interpreted as signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_u))] +pub fn i16x8_narrow_i32x4_u(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_narrow_i16x8_u(transmute(a), transmute(b))) } +} + +// FIXME waiting on a runtime implementation to test +// /// Converts low half of the smaller lane vector to a larger lane +// /// vector, sign extended. 
+// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_s))] +// pub fn i16x8_widen_low_i8x16_s(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_low_i16x8_s(transmute(a))) } +// } + +// /// Converts high half of the smaller lane vector to a larger lane +// /// vector, sign extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_s))] +// pub fn i16x8_widen_high_i8x16_s(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_high_i16x8_s(transmute(a))) } +// } + +// /// Converts low half of the smaller lane vector to a larger lane +// /// vector, zero extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_u))] +// pub fn i16x8_widen_low_i8x16_u(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_low_i16x8_u(transmute(a))) } +// } + +// /// Converts high half of the smaller lane vector to a larger lane +// /// vector, zero extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_u))] +// pub fn i16x8_widen_high_i8x16_u(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_high_i16x8_u(transmute(a))) } +// } + /// Shifts each lane to the left by the specified number of bits. /// /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i16x8.shl))] pub fn i16x8_shl(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shl(a.as_i16x8(), i16x8::splat(amt as i16))) } @@ -1143,8 +1520,7 @@ pub fn i16x8_shl(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i16x8.shl))] +#[cfg_attr(test, assert_instr(i16x8.shr_s))] pub fn i16x8_shr_s(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(amt as i16))) } } @@ -1155,8 +1531,7 @@ pub fn i16x8_shr_s(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i16x8.shl))] +#[cfg_attr(test, assert_instr(i16x8.shr_u))] pub fn i16x8_shr_u(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_u16x8(), u16x8::splat(amt as u16))) } } @@ -1215,6 +1590,76 @@ pub fn i16x8_mul(a: v128, b: v128) -> v128 { unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } } +/// Compares lane-wise signed integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.min_s))] +pub fn i16x8_min_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i16x8>(a); + let b = transmute::<_, i16x8>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.min_u))] +pub fn i16x8_min_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u16x8>(a); + let b = transmute::<_, u16x8>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) + } +} + +/// Compares lane-wise signed integers, and returns the maximum of +/// each pair. 
+#[inline] +#[cfg_attr(test, assert_instr(i16x8.max_s))] +pub fn i16x8_max_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i16x8>(a); + let b = transmute::<_, i16x8>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.max_u))] +pub fn i16x8_max_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u16x8>(a); + let b = transmute::<_, u16x8>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) + } +} + +/// Lane-wise rounding average. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.avgr_u))] +pub fn i16x8_avgr_u(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_avgr_u_i16x8(transmute(a), transmute(b))) } +} + +/// Lane-wise wrapping absolute value. +#[inline] +// #[cfg_attr(test, assert_instr(i32x4.abs))] // FIXME support not in our LLVM yet +pub fn i32x4_abs(a: v128) -> v128 { + unsafe { + let a = transmute::<_, i32x4>(a); + let zero = i32x4::splat(0); + transmute(simd_select::( + simd_lt(a, zero), + simd_sub(zero, a), + a, + )) + } +} + /// Negates a 128-bit vectors intepreted as four 32-bit signed integers #[inline] #[cfg_attr(test, assert_instr(i32x4.neg))] @@ -1236,12 +1681,53 @@ pub fn i32x4_all_true(a: v128) -> i32 { unsafe { llvm_i32x4_all_true(a.as_i32x4()) } } +// FIXME: not available in our LLVM yet +// /// Extracts the high bit for each lane in `a` and produce a scalar mask with +// /// all bits concatenated. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.all_true))] +// pub fn i32x4_bitmask(a: v128) -> i32 { +// unsafe { llvm_bitmask_i32x4(transmute(a)) } +// } + +// FIXME waiting on a runtime implementation to test +// /// Converts low half of the smaller lane vector to a larger lane +// /// vector, sign extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_s))] +// pub fn i32x4_widen_low_i16x8_s(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_low_i32x4_s(transmute(a))) } +// } + +// /// Converts high half of the smaller lane vector to a larger lane +// /// vector, sign extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_s))] +// pub fn i32x4_widen_high_i16x8_s(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_high_i32x4_s(transmute(a))) } +// } + +// /// Converts low half of the smaller lane vector to a larger lane +// /// vector, zero extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_u))] +// pub fn i32x4_widen_low_i16x8_u(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_low_i32x4_u(transmute(a))) } +// } + +// /// Converts high half of the smaller lane vector to a larger lane +// /// vector, zero extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_u))] +// pub fn i32x4_widen_high_i16x8_u(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_high_i32x4_u(transmute(a))) } +// } + /// Shifts each lane to the left by the specified number of bits. /// /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i32x4.shl))] pub fn i32x4_shl(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shl(a.as_i32x4(), i32x4::splat(amt as i32))) } @@ -1253,8 +1739,7 @@ pub fn i32x4_shl(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. 
#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i32x4.shl))] +#[cfg_attr(test, assert_instr(i32x4.shr_s))] pub fn i32x4_shr_s(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(amt as i32))) } } @@ -1265,8 +1750,7 @@ pub fn i32x4_shr_s(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i32x4.shl))] +#[cfg_attr(test, assert_instr(i32x4.shr_u))] pub fn i32x4_shr_u(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_u32x4(), u32x4::splat(amt as u32))) } } @@ -1293,28 +1777,59 @@ pub fn i32x4_mul(a: v128, b: v128) -> v128 { unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) } } -/// Negates a 128-bit vectors intepreted as two 64-bit signed integers +/// Compares lane-wise signed integers, and returns the minimum of +/// each pair. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i32x4.neg))] -pub fn i64x2_neg(a: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i64x2(), i64x2::splat(-1))) } +#[cfg_attr(test, assert_instr(i32x4.min_s))] +pub fn i32x4_min_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i32x4>(a); + let b = transmute::<_, i32x4>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) + } } -/// Returns 1 if any lane is nonzero or 0 if all lanes are zero. +/// Compares lane-wise unsigned integers, and returns the minimum of +/// each pair. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.any_true))] -pub fn i64x2_any_true(a: v128) -> i32 { - unsafe { llvm_i64x2_any_true(a.as_i64x2()) } +#[cfg_attr(test, assert_instr(i32x4.min_u))] +pub fn i32x4_min_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u32x4>(a); + let b = transmute::<_, u32x4>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) + } } -/// Returns 1 if all lanes are nonzero or 0 if any lane is nonzero. +/// Compares lane-wise signed integers, and returns the maximum of +/// each pair. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.all_true))] -pub fn i64x2_all_true(a: v128) -> i32 { - unsafe { llvm_i64x2_all_true(a.as_i64x2()) } +#[cfg_attr(test, assert_instr(i32x4.max_s))] +pub fn i32x4_max_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i32x4>(a); + let b = transmute::<_, i32x4>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.max_u))] +pub fn i32x4_max_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u32x4>(a); + let b = transmute::<_, u32x4>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) + } +} + +/// Negates a 128-bit vectors intepreted as two 64-bit signed integers +#[inline] +#[cfg_attr(test, assert_instr(i64x2.neg))] +pub fn i64x2_neg(a: v128) -> v128 { + unsafe { transmute(simd_mul(a.as_i64x2(), i64x2::splat(-1))) } } /// Shifts each lane to the left by the specified number of bits. @@ -1322,7 +1837,6 @@ pub fn i64x2_all_true(a: v128) -> i32 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. 
#[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i64x2.shl))] pub fn i64x2_shl(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shl(a.as_i64x2(), i64x2::splat(amt as i64))) } @@ -1334,8 +1848,7 @@ pub fn i64x2_shl(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.shl))] +#[cfg_attr(test, assert_instr(i64x2.shr_s))] pub fn i64x2_shr_s(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_i64x2(), i64x2::splat(amt as i64))) } } @@ -1346,15 +1859,13 @@ pub fn i64x2_shr_s(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.shl))] +#[cfg_attr(test, assert_instr(i64x2.shr_u))] pub fn i64x2_shr_u(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_u64x2(), u64x2::splat(amt as u64))) } } /// Adds two 128-bit vectors as if they were two packed two 64-bit integers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i64x2.add))] pub fn i64x2_add(a: v128, b: v128) -> v128 { unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } @@ -1362,12 +1873,18 @@ pub fn i64x2_add(a: v128, b: v128) -> v128 { /// Subtracts two 128-bit vectors as if they were two packed two 64-bit integers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i64x2.sub))] pub fn i64x2_sub(a: v128, b: v128) -> v128 { unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } } +/// Multiplies two 128-bit vectors as if they were two packed two 64-bit integers. +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.mul))] // FIXME: not present in our LLVM +pub fn i64x2_mul(a: v128, b: v128) -> v128 { + unsafe { transmute(simd_mul(a.as_i64x2(), b.as_i64x2())) } +} + /// Calculates the absolute value of each lane of a 128-bit vector interpreted /// as four 32-bit floating point numbers. #[inline] @@ -1387,7 +1904,6 @@ pub fn f32x4_neg(a: v128) -> v128 { /// Calculates the square root of each lane of a 128-bit vector interpreted as /// four 32-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f32x4.sqrt))] pub fn f32x4_sqrt(a: v128) -> v128 { unsafe { transmute(llvm_f32x4_sqrt(a.as_f32x4())) } @@ -1420,7 +1936,6 @@ pub fn f32x4_mul(a: v128, b: v128) -> v128 { /// Divides pairwise lanes of two 128-bit vectors interpreted as four 32-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f32x4.div))] pub fn f32x4_div(a: v128, b: v128) -> v128 { unsafe { transmute(simd_div(a.as_f32x4(), b.as_f32x4())) } @@ -1445,7 +1960,6 @@ pub fn f32x4_max(a: v128, b: v128) -> v128 { /// Calculates the absolute value of each lane of a 128-bit vector interpreted /// as two 64-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.abs))] pub fn f64x2_abs(a: v128) -> v128 { unsafe { transmute(llvm_f64x2_abs(a.as_f64x2())) } @@ -1454,8 +1968,7 @@ pub fn f64x2_abs(a: v128) -> v128 { /// Negates each lane of a 128-bit vector interpreted as two 64-bit floating /// point numbers. 
#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(f64x2.abs))] +#[cfg_attr(test, assert_instr(f64x2.neg))] pub fn f64x2_neg(a: v128) -> v128 { unsafe { f64x2_mul(a, transmute(f64x2(-1.0, -1.0))) } } @@ -1463,7 +1976,6 @@ pub fn f64x2_neg(a: v128) -> v128 { /// Calculates the square root of each lane of a 128-bit vector interpreted as /// two 64-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.sqrt))] pub fn f64x2_sqrt(a: v128) -> v128 { unsafe { transmute(llvm_f64x2_sqrt(a.as_f64x2())) } @@ -1472,7 +1984,6 @@ pub fn f64x2_sqrt(a: v128) -> v128 { /// Adds pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.add))] pub fn f64x2_add(a: v128, b: v128) -> v128 { unsafe { transmute(simd_add(a.as_f64x2(), b.as_f64x2())) } @@ -1481,7 +1992,6 @@ pub fn f64x2_add(a: v128, b: v128) -> v128 { /// Subtracts pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.sub))] pub fn f64x2_sub(a: v128, b: v128) -> v128 { unsafe { transmute(simd_sub(a.as_f64x2(), b.as_f64x2())) } @@ -1490,7 +2000,6 @@ pub fn f64x2_sub(a: v128, b: v128) -> v128 { /// Multiplies pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.mul))] pub fn f64x2_mul(a: v128, b: v128) -> v128 { unsafe { transmute(simd_mul(a.as_f64x2(), b.as_f64x2())) } @@ -1499,7 +2008,6 @@ pub fn f64x2_mul(a: v128, b: v128) -> v128 { /// Divides pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.div))] pub fn f64x2_div(a: v128, b: v128) -> v128 { unsafe { transmute(simd_div(a.as_f64x2(), b.as_f64x2())) } @@ -1508,7 +2016,6 @@ pub fn f64x2_div(a: v128, b: v128) -> v128 { /// Calculates the minimum of pairwise lanes of two 128-bit vectors interpreted /// as two 64-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.min))] pub fn f64x2_min(a: v128, b: v128) -> v128 { unsafe { transmute(llvm_f64x2_min(a.as_f64x2(), b.as_f64x2())) } @@ -1517,7 +2024,6 @@ pub fn f64x2_min(a: v128, b: v128) -> v128 { /// Calculates the maximum of pairwise lanes of two 128-bit vectors interpreted /// as two 64-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.max))] pub fn f64x2_max(a: v128, b: v128) -> v128 { unsafe { transmute(llvm_f64x2_max(a.as_f64x2(), b.as_f64x2())) } @@ -1530,7 +2036,7 @@ pub fn f64x2_max(a: v128, b: v128) -> v128 { /// representable intger. #[inline] #[cfg_attr(test, assert_instr("i32x4.trunc_sat_f32x4_s"))] -pub fn i32x4_trunc_s_f32x4_sat(a: v128) -> v128 { +pub fn i32x4_trunc_sat_f32x4_s(a: v128) -> v128 { unsafe { transmute(simd_cast::<_, i32x4>(a.as_f32x4())) } } @@ -1541,34 +2047,10 @@ pub fn i32x4_trunc_s_f32x4_sat(a: v128) -> v128 { /// representable intger. 
#[inline] #[cfg_attr(test, assert_instr("i32x4.trunc_sat_f32x4_u"))] -pub fn i32x4_trunc_u_f32x4_sat(a: v128) -> v128 { +pub fn i32x4_trunc_sat_f32x4_su(a: v128) -> v128 { unsafe { transmute(simd_cast::<_, u32x4>(a.as_f32x4())) } } -/// Converts a 128-bit vector interpreted as two 64-bit floating point numbers -/// into a 128-bit vector of two 64-bit signed integers. -/// -/// NaN is converted to 0 and if it's out of bounds it becomes the nearest -/// representable intger. -#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr("i64x2.trunc_s/f64x2:sat"))] -pub fn i64x2_trunc_s_f64x2_sat(a: v128) -> v128 { - unsafe { transmute(simd_cast::<_, i64x2>(a.as_f64x2())) } -} - -/// Converts a 128-bit vector interpreted as two 64-bit floating point numbers -/// into a 128-bit vector of two 64-bit unsigned integers. -/// -/// NaN is converted to 0 and if it's out of bounds it becomes the nearest -/// representable intger. -#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr("i64x2.trunc_u/f64x2:sat"))] -pub fn i64x2_trunc_u_f64x2_sat(a: v128) -> v128 { - unsafe { transmute(simd_cast::<_, u64x2>(a.as_f64x2())) } -} - /// Converts a 128-bit vector interpreted as four 32-bit signed integers into a /// 128-bit vector of four 32-bit floating point numbers. #[inline] @@ -1585,24 +2067,6 @@ pub fn f32x4_convert_i32x4_u(a: v128) -> v128 { unsafe { transmute(simd_cast::<_, f32x4>(a.as_u32x4())) } } -/// Converts a 128-bit vector interpreted as two 64-bit signed integers into a -/// 128-bit vector of two 64-bit floating point numbers. -#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr("f64x2.convert_s/i64x2"))] -pub fn f64x2_convert_s_i64x2(a: v128) -> v128 { - unsafe { transmute(simd_cast::<_, f64x2>(a.as_i64x2())) } -} - -/// Converts a 128-bit vector interpreted as two 64-bit unsigned integers into a -/// 128-bit vector of two 64-bit floating point numbers. -#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr("f64x2.convert_u/i64x2"))] -pub fn f64x2_convert_u_i64x2(a: v128) -> v128 { - unsafe { transmute(simd_cast::<_, f64x2>(a.as_u64x2())) } -} - #[cfg(test)] pub mod tests { use super::*; @@ -1610,7 +2074,6 @@ pub mod tests { use std::mem; use std::num::Wrapping; use std::prelude::v1::*; - use wasm_bindgen_test::*; fn compare_bytes(a: v128, b: v128) { let a: [u8; 16] = unsafe { transmute(a) }; @@ -1618,17 +2081,15 @@ pub mod tests { assert_eq!(a, b); } - #[wasm_bindgen_test] - #[cfg(not(only_node_compatible_functions))] + #[test] fn test_v128_const() { - const A: v128 = - unsafe { super::v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) }; + const A: v128 = super::v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); compare_bytes(A, A); } macro_rules! test_splat { ($test_id:ident: $val:expr => $($vals:expr),*) => { - #[wasm_bindgen_test] + #[test] fn $test_id() { let a = super::$test_id($val); let b: v128 = unsafe { @@ -1642,10 +2103,8 @@ pub mod tests { test_splat!(i8x16_splat: 42 => 42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42); test_splat!(i16x8_splat: 42 => 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0); test_splat!(i32x4_splat: 42 => 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0); - #[cfg(not(only_node_compatible_functions))] test_splat!(i64x2_splat: 42 => 42, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0); test_splat!(f32x4_splat: 42. 
=> 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66); - #[cfg(not(only_node_compatible_functions))] test_splat!(f64x2_splat: 42. => 0, 0, 0, 0, 0, 0, 69, 64, 0, 0, 0, 0, 0, 0, 69, 64); // tests extract and replace lanes @@ -1658,14 +2117,14 @@ pub mod tests { count: $count:expr, indices: [$($idx:expr),*], ) => { - #[wasm_bindgen_test] + #[test] fn $test_id() { unsafe { let arr: [$elem; $count] = [123 as $elem; $count]; let vec: v128 = transmute(arr); $( assert_eq!($extract(vec, $idx), 123 as $elem); - )*; + )* // create a vector from array and check that the indices contain // the same values as in the array: @@ -1676,7 +2135,7 @@ pub mod tests { let tmp = $replace(vec, $idx, 124 as $elem); assert_eq!($extract(tmp, $idx), 124 as $elem); - )*; + )* } } } @@ -1706,7 +2165,6 @@ pub mod tests { count: 4, indices: [0, 1, 2, 3], } - #[cfg(not(only_node_compatible_functions))] test_extract! { name: test_i64x2_extract_replace, extract: i64x2_extract_lane, @@ -1723,7 +2181,6 @@ pub mod tests { count: 4, indices: [0, 1, 2, 3], } - #[cfg(not(only_node_compatible_functions))] test_extract! { name: test_f64x2_extract_replace, extract: f64x2_extract_lane, @@ -1739,7 +2196,7 @@ pub mod tests { $([$($vec1:tt)*] ($op:tt | $f:ident) [$($vec2:tt)*],)* })* ) => ($( - #[wasm_bindgen_test] + #[test] fn $name() { unsafe { $( @@ -1768,7 +2225,7 @@ pub mod tests { $(($op:tt | $f:ident) [$($vec1:tt)*],)* })* ) => ($( - #[wasm_bindgen_test] + #[test] fn $name() { unsafe { $( @@ -1816,19 +2273,6 @@ pub mod tests { (- | i8x16_sub) [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8], } - test_i8x16_mul => { - [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - (* | i8x16_mul) - [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] - (* | i8x16_mul) - [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], - - [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] - (* | i8x16_mul) - [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 30, 3], - } test_i16x8_add => { [0i16, 0, 0, 0, 0, 0, 0, 0] @@ -1910,425 +2354,401 @@ pub mod tests { // TODO: test_i64x2_neg } - // #[wasm_bindgen_test] - // fn v8x16_shuffle() { - // unsafe { - // let a = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - // let b = [ - // 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - // 31, - // ]; - // - // let vec_a: v128 = transmute(a); - // let vec_b: v128 = transmute(b); - // - // let vec_r = v8x16_shuffle!( - // vec_a, - // vec_b, - // [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30] - // ); - // - // let e = - // [0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]; - // let vec_e: v128 = transmute(e); - // compare_bytes(vec_r, vec_e); - // } - // } - // - // macro_rules! floating_point { - // (f32) => { - // true - // }; - // (f64) => { - // true - // }; - // ($id:ident) => { - // false - // }; - // } - // - // trait IsNan: Sized { - // fn is_nan(self) -> bool { - // false - // } - // } - // impl IsNan for i8 {} - // impl IsNan for i16 {} - // impl IsNan for i32 {} - // impl IsNan for i64 {} - // - // macro_rules! 
test_bop { - // ($id:ident[$ety:ident; $ecount:expr] | - // $binary_op:ident [$op_test_id:ident] : - // ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { - // test_bop!( - // $id[$ety; $ecount] => $ety | $binary_op [ $op_test_id ]: - // ([$($in_a),*], [$($in_b),*]) => [$($out),*] - // ); - // - // }; - // ($id:ident[$ety:ident; $ecount:expr] => $oty:ident | - // $binary_op:ident [$op_test_id:ident] : - // ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { - // #[wasm_bindgen_test] - // fn $op_test_id() { - // unsafe { - // let a_input: [$ety; $ecount] = [$($in_a),*]; - // let b_input: [$ety; $ecount] = [$($in_b),*]; - // let output: [$oty; $ecount] = [$($out),*]; - // - // let a_vec_in: v128 = transmute(a_input); - // let b_vec_in: v128 = transmute(b_input); - // let vec_res: v128 = $id::$binary_op(a_vec_in, b_vec_in); - // - // let res: [$oty; $ecount] = transmute(vec_res); - // - // if !floating_point!($ety) { - // assert_eq!(res, output); - // } else { - // for i in 0..$ecount { - // let r = res[i]; - // let o = output[i]; - // assert_eq!(r.is_nan(), o.is_nan()); - // if !r.is_nan() { - // assert_eq!(r, o); - // } - // } - // } - // } - // } - // } - // } - // - // macro_rules! test_bops { - // ($id:ident[$ety:ident; $ecount:expr] | - // $binary_op:ident [$op_test_id:ident]: - // ([$($in_a:expr),*], $in_b:expr) => [$($out:expr),*]) => { - // #[wasm_bindgen_test] - // fn $op_test_id() { - // unsafe { - // let a_input: [$ety; $ecount] = [$($in_a),*]; - // let output: [$ety; $ecount] = [$($out),*]; - // - // let a_vec_in: v128 = transmute(a_input); - // let vec_res: v128 = $id::$binary_op(a_vec_in, $in_b); - // - // let res: [$ety; $ecount] = transmute(vec_res); - // assert_eq!(res, output); - // } - // } - // } - // } - // - // macro_rules! 
test_uop { - // ($id:ident[$ety:ident; $ecount:expr] | - // $unary_op:ident [$op_test_id:ident]: [$($in_a:expr),*] => [$($out:expr),*]) => { - // #[wasm_bindgen_test] - // fn $op_test_id() { - // unsafe { - // let a_input: [$ety; $ecount] = [$($in_a),*]; - // let output: [$ety; $ecount] = [$($out),*]; - // - // let a_vec_in: v128 = transmute(a_input); - // let vec_res: v128 = $id::$unary_op(a_vec_in); - // - // let res: [$ety; $ecount] = transmute(vec_res); - // assert_eq!(res, output); - // } - // } - // } - // } - // - // - // - // test_bops!(i8x16[i8; 16] | shl[i8x16_shl_test]: - // ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => - // [0, -2, 4, 6, 8, 10, 12, -2, 2, 2, 2, 2, 2, 2, 2, 2]); - // test_bops!(i16x8[i16; 8] | shl[i16x8_shl_test]: - // ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => - // [0, -2, 4, 6, 8, 10, 12, -2]); - // test_bops!(i32x4[i32; 4] | shl[i32x4_shl_test]: - // ([0, -1, 2, 3], 1) => [0, -2, 4, 6]); - // test_bops!(i64x2[i64; 2] | shl[i64x2_shl_test]: - // ([0, -1], 1) => [0, -2]); - // - // test_bops!(i8x16[i8; 16] | shr_s[i8x16_shr_s_test]: - // ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => - // [0, -1, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]); - // test_bops!(i16x8[i16; 8] | shr_s[i16x8_shr_s_test]: - // ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => - // [0, -1, 1, 1, 2, 2, 3, i16::MAX / 2]); - // test_bops!(i32x4[i32; 4] | shr_s[i32x4_shr_s_test]: - // ([0, -1, 2, 3], 1) => [0, -1, 1, 1]); - // test_bops!(i64x2[i64; 2] | shr_s[i64x2_shr_s_test]: - // ([0, -1], 1) => [0, -1]); - // - // test_bops!(i8x16[i8; 16] | shr_u[i8x16_uhr_u_test]: - // ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => - // [0, i8::MAX, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]); - // test_bops!(i16x8[i16; 8] | shr_u[i16x8_uhr_u_test]: - // ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => - // [0, i16::MAX, 1, 1, 2, 2, 3, i16::MAX / 2]); - // test_bops!(i32x4[i32; 4] | shr_u[i32x4_uhr_u_test]: - // ([0, -1, 2, 3], 1) => [0, i32::MAX, 1, 1]); - // test_bops!(i64x2[i64; 2] | shr_u[i64x2_uhr_u_test]: - // ([0, -1], 1) => [0, i64::MAX]); - // - // #[wasm_bindgen_test] - // fn v128_bitwise_logical_ops() { - // unsafe { - // let a: [u32; 4] = [u32::MAX, 0, u32::MAX, 0]; - // let b: [u32; 4] = [u32::MAX; 4]; - // let c: [u32; 4] = [0; 4]; - // - // let vec_a: v128 = transmute(a); - // let vec_b: v128 = transmute(b); - // let vec_c: v128 = transmute(c); - // - // let r: v128 = v128::and(vec_a, vec_a); - // compare_bytes(r, vec_a); - // let r: v128 = v128::and(vec_a, vec_b); - // compare_bytes(r, vec_a); - // let r: v128 = v128::or(vec_a, vec_b); - // compare_bytes(r, vec_b); - // let r: v128 = v128::not(vec_b); - // compare_bytes(r, vec_c); - // let r: v128 = v128::xor(vec_a, vec_c); - // compare_bytes(r, vec_a); - // - // let r: v128 = v128::bitselect(vec_b, vec_c, vec_b); - // compare_bytes(r, vec_b); - // let r: v128 = v128::bitselect(vec_b, vec_c, vec_c); - // compare_bytes(r, vec_c); - // let r: v128 = v128::bitselect(vec_b, vec_c, vec_a); - // compare_bytes(r, vec_a); - // } - // } - // - // macro_rules! 
test_bool_red { - // ($id:ident[$test_id:ident] | [$($true:expr),*] | [$($false:expr),*] | [$($alt:expr),*]) => { - // #[wasm_bindgen_test] - // fn $test_id() { - // unsafe { - // let vec_a: v128 = transmute([$($true),*]); // true - // let vec_b: v128 = transmute([$($false),*]); // false - // let vec_c: v128 = transmute([$($alt),*]); // alternating - // - // assert_eq!($id::any_true(vec_a), 1); - // assert_eq!($id::any_true(vec_b), 0); - // assert_eq!($id::any_true(vec_c), 1); - // - // assert_eq!($id::all_true(vec_a), 1); - // assert_eq!($id::all_true(vec_b), 0); - // assert_eq!($id::all_true(vec_c), 0); - // } - // } - // } - // } - // - // test_bool_red!( - // i8x16[i8x16_boolean_reductions] - // | [1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - // | [0_i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - // | [1_i8, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] - // ); - // test_bool_red!( - // i16x8[i16x8_boolean_reductions] - // | [1_i16, 1, 1, 1, 1, 1, 1, 1] - // | [0_i16, 0, 0, 0, 0, 0, 0, 0] - // | [1_i16, 0, 1, 0, 1, 0, 1, 0] - // ); - // test_bool_red!( - // i32x4[i32x4_boolean_reductions] - // | [1_i32, 1, 1, 1] - // | [0_i32, 0, 0, 0] - // | [1_i32, 0, 1, 0] - // ); - // test_bool_red!( - // i64x2[i64x2_boolean_reductions] | [1_i64, 1] | [0_i64, 0] | [1_i64, 0] - // ); - // - // test_bop!(i8x16[i8; 16] | eq[i8x16_eq_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - // [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i16x8[i16; 8] | eq[i16x8_eq_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i32x4[i32; 4] | eq[i32x4_eq_test]: - // ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); - // test_bop!(i64x2[i64; 2] | eq[i64x2_eq_test]: ([0, 1], [0, 2]) => [-1, 0]); - // test_bop!(f32x4[f32; 4] => i32 | eq[f32x4_eq_test]: - // ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); - // test_bop!(f64x2[f64; 2] => i64 | eq[f64x2_eq_test]: ([0., 1.], [0., 2.]) => [-1, 0]); - // - // test_bop!(i8x16[i8; 16] | ne[i8x16_ne_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - // [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => - // [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i16x8[i16; 8] | ne[i16x8_ne_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => - // [0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i32x4[i32; 4] | ne[i32x4_ne_test]: - // ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); - // test_bop!(i64x2[i64; 2] | ne[i64x2_ne_test]: ([0, 1], [0, 2]) => [0, -1]); - // test_bop!(f32x4[f32; 4] => i32 | ne[f32x4_ne_test]: - // ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]); - // test_bop!(f64x2[f64; 2] => i64 | ne[f64x2_ne_test]: ([0., 1.], [0., 2.]) => [0, -1]); - // - // test_bop!(i8x16[i8; 16] | lt[i8x16_lt_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - // [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => - // [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i16x8[i16; 8] | lt[i16x8_lt_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => - // [0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i32x4[i32; 4] | lt[i32x4_lt_test]: - // ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); - // test_bop!(i64x2[i64; 2] | lt[i64x2_lt_test]: ([0, 1], [0, 2]) => [0, -1]); - // test_bop!(f32x4[f32; 4] => i32 | lt[f32x4_lt_test]: - // ([0., 1., 2., 3.], [0., 2., 2., 4.]) 
=> [0, -1, 0, -1]); - // test_bop!(f64x2[f64; 2] => i64 | lt[f64x2_lt_test]: ([0., 1.], [0., 2.]) => [0, -1]); - // - // test_bop!(i8x16[i8; 16] | gt[i8x16_gt_test]: - // ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15], - // [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) => - // [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i16x8[i16; 8] | gt[i16x8_gt_test]: - // ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) => - // [0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i32x4[i32; 4] | gt[i32x4_gt_test]: - // ([0, 2, 2, 4], [0, 1, 2, 3]) => [0, -1, 0, -1]); - // test_bop!(i64x2[i64; 2] | gt[i64x2_gt_test]: ([0, 2], [0, 1]) => [0, -1]); - // test_bop!(f32x4[f32; 4] => i32 | gt[f32x4_gt_test]: - // ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [0, -1, 0, -1]); - // test_bop!(f64x2[f64; 2] => i64 | gt[f64x2_gt_test]: ([0., 2.], [0., 1.]) => [0, -1]); - // - // test_bop!(i8x16[i8; 16] | ge[i8x16_ge_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - // [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i16x8[i16; 8] | ge[i16x8_ge_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i32x4[i32; 4] | ge[i32x4_ge_test]: - // ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); - // test_bop!(i64x2[i64; 2] | ge[i64x2_ge_test]: ([0, 1], [0, 2]) => [-1, 0]); - // test_bop!(f32x4[f32; 4] => i32 | ge[f32x4_ge_test]: - // ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); - // test_bop!(f64x2[f64; 2] => i64 | ge[f64x2_ge_test]: ([0., 1.], [0., 2.]) => [-1, 0]); - // - // test_bop!(i8x16[i8; 16] | le[i8x16_le_test]: - // ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15], - // [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - // ) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i16x8[i16; 8] | le[i16x8_le_test]: - // ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i32x4[i32; 4] | le[i32x4_le_test]: - // ([0, 2, 2, 4], [0, 1, 2, 3]) => [-1, 0, -1, 0]); - // test_bop!(i64x2[i64; 2] | le[i64x2_le_test]: ([0, 2], [0, 1]) => [-1, 0]); - // test_bop!(f32x4[f32; 4] => i32 | le[f32x4_le_test]: - // ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [-1, 0, -1, -0]); - // test_bop!(f64x2[f64; 2] => i64 | le[f64x2_le_test]: ([0., 2.], [0., 1.]) => [-1, 0]); - // - // #[wasm_bindgen_test] - // fn v128_bitwise_load_store() { - // unsafe { - // let mut arr: [i32; 4] = [0, 1, 2, 3]; - // - // let vec = v128::load(arr.as_ptr() as *const v128); - // let vec = i32x4::add(vec, vec); - // v128::store(arr.as_mut_ptr() as *mut v128, vec); - // - // assert_eq!(arr, [0, 2, 4, 6]); - // } - // } - // - // test_uop!(f32x4[f32; 4] | neg[f32x4_neg_test]: [0., 1., 2., 3.] => [ 0., -1., -2., -3.]); - // test_uop!(f32x4[f32; 4] | abs[f32x4_abs_test]: [0., -1., 2., -3.] 
=> [ 0., 1., 2., 3.]); - // test_bop!(f32x4[f32; 4] | min[f32x4_min_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., -3., -4., 8.]); - // test_bop!(f32x4[f32; 4] | min[f32x4_min_test_nan]: - // ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN]) - // => [0., -3., -4., std::f32::NAN]); - // test_bop!(f32x4[f32; 4] | max[f32x4_max_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -1., 7., 10.]); - // test_bop!(f32x4[f32; 4] | max[f32x4_max_test_nan]: - // ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN]) - // => [1., -1., 7., std::f32::NAN]); - // test_bop!(f32x4[f32; 4] | add[f32x4_add_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -4., 3., 18.]); - // test_bop!(f32x4[f32; 4] | sub[f32x4_sub_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [-1., 2., 11., -2.]); - // test_bop!(f32x4[f32; 4] | mul[f32x4_mul_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., 3., -28., 80.]); - // test_bop!(f32x4[f32; 4] | div[f32x4_div_test]: - // ([0., -8., 70., 8.], [1., 4., 10., 2.]) => [0., -2., 7., 4.]); - // - // test_uop!(f64x2[f64; 2] | neg[f64x2_neg_test]: [0., 1.] => [ 0., -1.]); - // test_uop!(f64x2[f64; 2] | abs[f64x2_abs_test]: [0., -1.] => [ 0., 1.]); - // test_bop!(f64x2[f64; 2] | min[f64x2_min_test]: - // ([0., -1.], [1., -3.]) => [0., -3.]); - // test_bop!(f64x2[f64; 2] | min[f64x2_min_test_nan]: - // ([7., 8.], [-4., std::f64::NAN]) - // => [ -4., std::f64::NAN]); - // test_bop!(f64x2[f64; 2] | max[f64x2_max_test]: - // ([0., -1.], [1., -3.]) => [1., -1.]); - // test_bop!(f64x2[f64; 2] | max[f64x2_max_test_nan]: - // ([7., 8.], [ -4., std::f64::NAN]) - // => [7., std::f64::NAN]); - // test_bop!(f64x2[f64; 2] | add[f64x2_add_test]: - // ([0., -1.], [1., -3.]) => [1., -4.]); - // test_bop!(f64x2[f64; 2] | sub[f64x2_sub_test]: - // ([0., -1.], [1., -3.]) => [-1., 2.]); - // test_bop!(f64x2[f64; 2] | mul[f64x2_mul_test]: - // ([0., -1.], [1., -3.]) => [0., 3.]); - // test_bop!(f64x2[f64; 2] | div[f64x2_div_test]: - // ([0., -8.], [1., 4.]) => [0., -2.]); - // - // macro_rules! test_conv { - // ($test_id:ident | $conv_id:ident | $to_ty:ident | $from:expr, $to:expr) => { - // #[wasm_bindgen_test] - // fn $test_id() { - // unsafe { - // let from: v128 = transmute($from); - // let to: v128 = transmute($to); - // - // let r: v128 = $to_ty::$conv_id(from); - // - // compare_bytes(r, to); - // } - // } - // }; - // } - // - // test_conv!( - // f32x4_convert_s_i32x4 | convert_s_i32x4 | f32x4 | [1_i32, 2, 3, 4], - // [1_f32, 2., 3., 4.] - // ); - // test_conv!( - // f32x4_convert_u_i32x4 - // | convert_u_i32x4 - // | f32x4 - // | [u32::MAX, 2, 3, 4], - // [u32::MAX as f32, 2., 3., 4.] - // ); - // test_conv!( - // f64x2_convert_s_i64x2 | convert_s_i64x2 | f64x2 | [1_i64, 2], - // [1_f64, 2.] - // ); + #[test] + fn v8x16_shuffle() { + unsafe { + let a = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + let b = [ + 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ]; + + let vec_a: v128 = transmute(a); + let vec_b: v128 = transmute(b); + + let vec_r = v8x16_shuffle!( + vec_a, vec_b, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30, + ); + + let e = [0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]; + let vec_e: v128 = transmute(e); + compare_bytes(vec_r, vec_e); + } + } + + macro_rules! 
floating_point { + (f32) => { + true + }; + (f64) => { + true + }; + ($id:ident) => { + false + }; + } + + trait IsNan: Sized { + fn is_nan(self) -> bool { + false + } + } + impl IsNan for i8 {} + impl IsNan for i16 {} + impl IsNan for i32 {} + impl IsNan for i64 {} + + macro_rules! test_bop { + ($id:ident[$ety:ident; $ecount:expr] | + $binary_op:ident [$op_test_id:ident] : + ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { + test_bop!( + $id[$ety; $ecount] => $ety | $binary_op [ $op_test_id ]: + ([$($in_a),*], [$($in_b),*]) => [$($out),*] + ); + + }; + ($id:ident[$ety:ident; $ecount:expr] => $oty:ident | + $binary_op:ident [$op_test_id:ident] : + ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { + #[test] + fn $op_test_id() { + unsafe { + let a_input: [$ety; $ecount] = [$($in_a),*]; + let b_input: [$ety; $ecount] = [$($in_b),*]; + let output: [$oty; $ecount] = [$($out),*]; + + let a_vec_in: v128 = transmute(a_input); + let b_vec_in: v128 = transmute(b_input); + let vec_res: v128 = $binary_op(a_vec_in, b_vec_in); + + let res: [$oty; $ecount] = transmute(vec_res); + + if !floating_point!($ety) { + assert_eq!(res, output); + } else { + for i in 0..$ecount { + let r = res[i]; + let o = output[i]; + assert_eq!(r.is_nan(), o.is_nan()); + if !r.is_nan() { + assert_eq!(r, o); + } + } + } + } + } + } + } + + macro_rules! test_bops { + ($id:ident[$ety:ident; $ecount:expr] | + $binary_op:ident [$op_test_id:ident]: + ([$($in_a:expr),*], $in_b:expr) => [$($out:expr),*]) => { + #[test] + fn $op_test_id() { + unsafe { + let a_input: [$ety; $ecount] = [$($in_a),*]; + let output: [$ety; $ecount] = [$($out),*]; + + let a_vec_in: v128 = transmute(a_input); + let vec_res: v128 = $binary_op(a_vec_in, $in_b); + + let res: [$ety; $ecount] = transmute(vec_res); + assert_eq!(res, output); + } + } + } + } + + macro_rules! 
test_uop { + ($id:ident[$ety:ident; $ecount:expr] | + $unary_op:ident [$op_test_id:ident]: [$($in_a:expr),*] => [$($out:expr),*]) => { + #[test] + fn $op_test_id() { + unsafe { + let a_input: [$ety; $ecount] = [$($in_a),*]; + let output: [$ety; $ecount] = [$($out),*]; + + let a_vec_in: v128 = transmute(a_input); + let vec_res: v128 = $unary_op(a_vec_in); + + let res: [$ety; $ecount] = transmute(vec_res); + assert_eq!(res, output); + } + } + } + } + + test_bops!(i8x16[i8; 16] | i8x16_shl[i8x16_shl_test]: + ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => + [0, -2, 4, 6, 8, 10, 12, -2, 2, 2, 2, 2, 2, 2, 2, 2]); + test_bops!(i16x8[i16; 8] | i16x8_shl[i16x8_shl_test]: + ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => + [0, -2, 4, 6, 8, 10, 12, -2]); + test_bops!(i32x4[i32; 4] | i32x4_shl[i32x4_shl_test]: + ([0, -1, 2, 3], 1) => [0, -2, 4, 6]); + test_bops!(i64x2[i64; 2] | i64x2_shl[i64x2_shl_test]: + ([0, -1], 1) => [0, -2]); + + test_bops!(i8x16[i8; 16] | i8x16_shr_s[i8x16_shr_s_test]: + ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => + [0, -1, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]); + test_bops!(i16x8[i16; 8] | i16x8_shr_s[i16x8_shr_s_test]: + ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => + [0, -1, 1, 1, 2, 2, 3, i16::MAX / 2]); + test_bops!(i32x4[i32; 4] | i32x4_shr_s[i32x4_shr_s_test]: + ([0, -1, 2, 3], 1) => [0, -1, 1, 1]); + test_bops!(i64x2[i64; 2] | i64x2_shr_s[i64x2_shr_s_test]: + ([0, -1], 1) => [0, -1]); + + test_bops!(i8x16[i8; 16] | i8x16_shr_u[i8x16_uhr_u_test]: + ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => + [0, i8::MAX, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]); + test_bops!(i16x8[i16; 8] | i16x8_shr_u[i16x8_uhr_u_test]: + ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => + [0, i16::MAX, 1, 1, 2, 2, 3, i16::MAX / 2]); + test_bops!(i32x4[i32; 4] | i32x4_shr_u[i32x4_uhr_u_test]: + ([0, -1, 2, 3], 1) => [0, i32::MAX, 1, 1]); + test_bops!(i64x2[i64; 2] | i64x2_shr_u[i64x2_uhr_u_test]: + ([0, -1], 1) => [0, i64::MAX]); + + #[test] + fn v128_bitwise_logical_ops() { + unsafe { + let a: [u32; 4] = [u32::MAX, 0, u32::MAX, 0]; + let b: [u32; 4] = [u32::MAX; 4]; + let c: [u32; 4] = [0; 4]; + + let vec_a: v128 = transmute(a); + let vec_b: v128 = transmute(b); + let vec_c: v128 = transmute(c); + + let r: v128 = v128_and(vec_a, vec_a); + compare_bytes(r, vec_a); + let r: v128 = v128_and(vec_a, vec_b); + compare_bytes(r, vec_a); + let r: v128 = v128_or(vec_a, vec_b); + compare_bytes(r, vec_b); + let r: v128 = v128_not(vec_b); + compare_bytes(r, vec_c); + let r: v128 = v128_xor(vec_a, vec_c); + compare_bytes(r, vec_a); + + let r: v128 = v128_bitselect(vec_b, vec_c, vec_b); + compare_bytes(r, vec_b); + let r: v128 = v128_bitselect(vec_b, vec_c, vec_c); + compare_bytes(r, vec_c); + let r: v128 = v128_bitselect(vec_b, vec_c, vec_a); + compare_bytes(r, vec_a); + } + } + + macro_rules! 
test_bool_red { + ([$test_id:ident, $any:ident, $all:ident] | [$($true:expr),*] | [$($false:expr),*] | [$($alt:expr),*]) => { + #[test] + fn $test_id() { + unsafe { + let vec_a: v128 = transmute([$($true),*]); // true + let vec_b: v128 = transmute([$($false),*]); // false + let vec_c: v128 = transmute([$($alt),*]); // alternating + + assert_eq!($any(vec_a), 1); + assert_eq!($any(vec_b), 0); + assert_eq!($any(vec_c), 1); + + assert_eq!($all(vec_a), 1); + assert_eq!($all(vec_b), 0); + assert_eq!($all(vec_c), 0); + } + } + } + } + + test_bool_red!( + [i8x16_boolean_reductions, i8x16_any_true, i8x16_all_true] + | [1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + | [0_i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + | [1_i8, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] + ); + test_bool_red!( + [i16x8_boolean_reductions, i16x8_any_true, i16x8_all_true] + | [1_i16, 1, 1, 1, 1, 1, 1, 1] + | [0_i16, 0, 0, 0, 0, 0, 0, 0] + | [1_i16, 0, 1, 0, 1, 0, 1, 0] + ); + test_bool_red!( + [i32x4_boolean_reductions, i32x4_any_true, i32x4_all_true] + | [1_i32, 1, 1, 1] + | [0_i32, 0, 0, 0] + | [1_i32, 0, 1, 0] + ); + + test_bop!(i8x16[i8; 16] | i8x16_eq[i8x16_eq_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i16x8[i16; 8] | i16x8_eq[i16x8_eq_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i32x4[i32; 4] | i32x4_eq[i32x4_eq_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_eq[f32x4_eq_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_eq[f64x2_eq_test]: ([0., 1.], [0., 2.]) => [-1, 0]); + + test_bop!(i8x16[i8; 16] | i8x16_ne[i8x16_ne_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i16x8[i16; 8] | i16x8_ne[i16x8_ne_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i32x4[i32; 4] | i32x4_ne[i32x4_ne_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_ne[f32x4_ne_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_ne[f64x2_ne_test]: ([0., 1.], [0., 2.]) => [0, -1]); + + test_bop!(i8x16[i8; 16] | i8x16_lt_s[i8x16_lt_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i16x8[i16; 8] | i16x8_lt_s[i16x8_lt_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i32x4[i32; 4] | i32x4_lt_s[i32x4_lt_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_lt[f32x4_lt_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_lt[f64x2_lt_test]: ([0., 1.], [0., 2.]) => [0, -1]); + + test_bop!(i8x16[i8; 16] | i8x16_gt_s[i8x16_gt_test]: + ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i16x8[i16; 8] | i16x8_gt_s[i16x8_gt_test]: + ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 
6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i32x4[i32; 4] | i32x4_gt_s[i32x4_gt_test]: + ([0, 2, 2, 4], [0, 1, 2, 3]) => [0, -1, 0, -1]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_gt[f32x4_gt_test]: + ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [0, -1, 0, -1]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_gt[f64x2_gt_test]: ([0., 2.], [0., 1.]) => [0, -1]); + + test_bop!(i8x16[i8; 16] | i8x16_ge_s[i8x16_ge_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i16x8[i16; 8] | i16x8_ge_s[i16x8_ge_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i32x4[i32; 4] | i32x4_ge_s[i32x4_ge_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_ge[f32x4_ge_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_ge[f64x2_ge_test]: ([0., 1.], [0., 2.]) => [-1, 0]); + + test_bop!(i8x16[i8; 16] | i8x16_le_s[i8x16_le_test]: + ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i16x8[i16; 8] | i16x8_le_s[i16x8_le_test]: + ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i32x4[i32; 4] | i32x4_le_s[i32x4_le_test]: + ([0, 2, 2, 4], [0, 1, 2, 3]) => [-1, 0, -1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_le[f32x4_le_test]: + ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [-1, 0, -1, -0]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_le[f64x2_le_test]: ([0., 2.], [0., 1.]) => [-1, 0]); + + #[test] + fn v128_bitwise_load_store() { + unsafe { + let mut arr: [i32; 4] = [0, 1, 2, 3]; + + let vec = v128_load(arr.as_ptr() as *const v128); + let vec = i32x4_add(vec, vec); + v128_store(arr.as_mut_ptr() as *mut v128, vec); + + assert_eq!(arr, [0, 2, 4, 6]); + } + } + + test_uop!(f32x4[f32; 4] | f32x4_neg[f32x4_neg_test]: [0., 1., 2., 3.] => [ 0., -1., -2., -3.]); + test_uop!(f32x4[f32; 4] | f32x4_abs[f32x4_abs_test]: [0., -1., 2., -3.] => [ 0., 1., 2., 3.]); + test_bop!(f32x4[f32; 4] | f32x4_min[f32x4_min_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., -3., -4., 8.]); + test_bop!(f32x4[f32; 4] | f32x4_min[f32x4_min_test_nan]: + ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN]) + => [0., -3., -4., std::f32::NAN]); + test_bop!(f32x4[f32; 4] | f32x4_max[f32x4_max_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -1., 7., 10.]); + test_bop!(f32x4[f32; 4] | f32x4_max[f32x4_max_test_nan]: + ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN]) + => [1., -1., 7., std::f32::NAN]); + test_bop!(f32x4[f32; 4] | f32x4_add[f32x4_add_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -4., 3., 18.]); + test_bop!(f32x4[f32; 4] | f32x4_sub[f32x4_sub_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [-1., 2., 11., -2.]); + test_bop!(f32x4[f32; 4] | f32x4_mul[f32x4_mul_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., 3., -28., 80.]); + test_bop!(f32x4[f32; 4] | f32x4_div[f32x4_div_test]: + ([0., -8., 70., 8.], [1., 4., 10., 2.]) => [0., -2., 7., 4.]); + + test_uop!(f64x2[f64; 2] | f64x2_neg[f64x2_neg_test]: [0., 1.] => [ 0., -1.]); + test_uop!(f64x2[f64; 2] | f64x2_abs[f64x2_abs_test]: [0., -1.] 
=> [ 0., 1.]); + test_bop!(f64x2[f64; 2] | f64x2_min[f64x2_min_test]: + ([0., -1.], [1., -3.]) => [0., -3.]); + test_bop!(f64x2[f64; 2] | f64x2_min[f64x2_min_test_nan]: + ([7., 8.], [-4., std::f64::NAN]) + => [ -4., std::f64::NAN]); + test_bop!(f64x2[f64; 2] | f64x2_max[f64x2_max_test]: + ([0., -1.], [1., -3.]) => [1., -1.]); + test_bop!(f64x2[f64; 2] | f64x2_max[f64x2_max_test_nan]: + ([7., 8.], [ -4., std::f64::NAN]) + => [7., std::f64::NAN]); + test_bop!(f64x2[f64; 2] | f64x2_add[f64x2_add_test]: + ([0., -1.], [1., -3.]) => [1., -4.]); + test_bop!(f64x2[f64; 2] | f64x2_sub[f64x2_sub_test]: + ([0., -1.], [1., -3.]) => [-1., 2.]); + test_bop!(f64x2[f64; 2] | f64x2_mul[f64x2_mul_test]: + ([0., -1.], [1., -3.]) => [0., 3.]); + test_bop!(f64x2[f64; 2] | f64x2_div[f64x2_div_test]: + ([0., -8.], [1., 4.]) => [0., -2.]); + + macro_rules! test_conv { + ($test_id:ident | $conv_id:ident | $to_ty:ident | $from:expr, $to:expr) => { + #[test] + fn $test_id() { + unsafe { + let from: v128 = transmute($from); + let to: v128 = transmute($to); + + let r: v128 = $conv_id(from); + + compare_bytes(r, to); + } + } + }; + } + + test_conv!( + f32x4_convert_s_i32x4 | f32x4_convert_i32x4_s | f32x4 | [1_i32, 2, 3, 4], + [1_f32, 2., 3., 4.] + ); + test_conv!( + f32x4_convert_u_i32x4 | f32x4_convert_i32x4_u | f32x4 | [u32::MAX, 2, 3, 4], + [u32::MAX as f32, 2., 3., 4.] + ); + + // FIXME: this fails, and produces 0 instead of saturating at i32::MAX // test_conv!( - // f64x2_convert_u_i64x2 - // | convert_u_i64x2 - // | f64x2 - // | [u64::MAX, 2], - // [18446744073709552000.0, 2.] + // i32x4_trunc_s_f32x4_sat + // | i32x4_trunc_sat_f32x4_s + // | i32x4 + // | [f32::NAN, 2., (i32::MAX as f32 + 1.), 4.], + // [0, 2, i32::MAX, 4] // ); - // - // // FIXME: this fails, and produces -2147483648 instead of saturating at - // // i32::MAX test_conv!(i32x4_trunc_s_f32x4_sat | trunc_s_f32x4_sat - // // | i32x4 | [1_f32, 2., (i32::MAX as f32 + 1.), 4.], - // // [1_i32, 2, i32::MAX, 4]); FIXME: add other saturating tests + // FIXME: add other saturating tests } diff --git a/crates/core_arch/tests/xcrate-macros.rs b/crates/core_arch/tests/xcrate-macros.rs new file mode 100644 index 0000000000..1b32a6c70d --- /dev/null +++ b/crates/core_arch/tests/xcrate-macros.rs @@ -0,0 +1,18 @@ +#![feature(stdsimd)] + +#[test] +#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] +fn wut() { + use core_arch::arch::wasm32; + let a = wasm32::v128_const(0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = wasm32::v128_const( + 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + + let vec_r = unsafe { + wasm32::v8x16_shuffle!(a, b, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30,) + }; + + let e = wasm32::v128_const(0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30); + assert_eq!(wasm32::i8x16_all_true(wasm32::i8x16_eq(e, vec_r)), 1); +} diff --git a/crates/std_detect/src/detect/mod.rs b/crates/std_detect/src/detect/mod.rs index 77d1f7c506..c44f44c1b3 100644 --- a/crates/std_detect/src/detect/mod.rs +++ b/crates/std_detect/src/detect/mod.rs @@ -56,6 +56,7 @@ cfg_if! { mod arch; } else { // Unimplemented architecture: + #[allow(dead_code)] mod arch { #[doc(hidden)] pub(crate) enum Feature { @@ -117,6 +118,7 @@ cfg_if! { /// Performs run-time feature detection. 
#[inline] +#[allow(dead_code)] fn check_for(x: Feature) -> bool { cache::test(x as u32, self::os::detect_features) } diff --git a/crates/std_detect/src/detect/os/other.rs b/crates/std_detect/src/detect/os/other.rs index bf7be87f07..091fafc4eb 100644 --- a/crates/std_detect/src/detect/os/other.rs +++ b/crates/std_detect/src/detect/os/other.rs @@ -2,6 +2,7 @@ use crate::detect::cache; +#[allow(dead_code)] pub(crate) fn detect_features() -> cache::Initializer { cache::Initializer::default() } diff --git a/crates/stdarch-test/Cargo.toml b/crates/stdarch-test/Cargo.toml index 2b445f8dc5..2fc42db92a 100644 --- a/crates/stdarch-test/Cargo.toml +++ b/crates/stdarch-test/Cargo.toml @@ -11,10 +11,8 @@ lazy_static = "1.0" rustc-demangle = "0.1.8" cfg-if = "0.1" -[target.wasm32-unknown-unknown.dependencies] -wasm-bindgen = "0.2.47" -js-sys = "0.3" -console_error_panic_hook = "0.1" +[target.'cfg(target_arch = "wasm32")'.dependencies] +wasmprinter = "0.2.6" [features] default = [] diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index fa73a7bba6..c66b6a8d9d 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -3,7 +3,6 @@ //! This basically just disassembles the current executable and then parses the //! output once globally and then provides the `assert` function which makes //! assertions about the disassembly of a function. -#![feature(const_transmute)] #![feature(vec_leak)] #![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)] @@ -20,19 +19,8 @@ pub use assert_instr_macro::*; pub use simd_test_macro::*; use std::{cmp, collections::HashSet, env, hash, str, sync::atomic::AtomicPtr}; -// `println!` doesn't work on wasm32 right now, so shadow the compiler's `println!` -// macro with our own shim that redirects to `console.log`. -#[allow(unused)] -#[cfg(target_arch = "wasm32")] -#[macro_export] -macro_rules! println { - ($($args:tt)*) => (crate::wasm::js_console_log(&format!($($args)*))) -} - cfg_if! { if #[cfg(target_arch = "wasm32")] { - extern crate wasm_bindgen; - extern crate console_error_panic_hook; pub mod wasm; use wasm::disassemble_myself; } else { diff --git a/crates/stdarch-test/src/wasm.rs b/crates/stdarch-test/src/wasm.rs index 612ff10d90..bf411c1214 100644 --- a/crates/stdarch-test/src/wasm.rs +++ b/crates/stdarch-test/src/wasm.rs @@ -1,49 +1,17 @@ //! Disassembly calling function for `wasm32` targets. -use wasm_bindgen::prelude::*; use crate::Function; use std::collections::HashSet; -#[wasm_bindgen(module = "child_process")] -extern "C" { - #[wasm_bindgen(js_name = execFileSync)] - fn exec_file_sync(cmd: &str, args: &js_sys::Array, opts: &js_sys::Object) -> Buffer; -} - -#[wasm_bindgen(module = "buffer")] -extern "C" { - type Buffer; - #[wasm_bindgen(method, js_name = toString)] - fn to_string(this: &Buffer) -> String; -} - -#[wasm_bindgen] -extern "C" { - #[wasm_bindgen(js_namespace = require)] - fn resolve(module: &str) -> String; - #[wasm_bindgen(js_namespace = console, js_name = log)] - pub fn js_console_log(s: &str); -} - pub(crate) fn disassemble_myself() -> HashSet { - use std::path::Path; - ::console_error_panic_hook::set_once(); - // Our wasm module in the wasm-bindgen test harness is called - // "wasm-bindgen-test_bg". When running in node this is actually a shim JS - // file. Ask node where that JS file is, and then we use that with a wasm - // extension to find the wasm file itself. 
- let js_shim = resolve("wasm-bindgen-test"); - let js_shim = Path::new(&js_shim).with_file_name("wasm-bindgen-test_bg.wasm"); - - // Execute `wasm2wat` synchronously, waiting for and capturing all of its - // output. Note that we pass in a custom `maxBuffer` parameter because we're - // generating a ton of output that needs to be buffered. - let args = js_sys::Array::new(); - args.push(&js_shim.display().to_string().into()); - args.push(&"--enable-simd".into()); - let opts = js_sys::Object::new(); - js_sys::Reflect::set(&opts, &"maxBuffer".into(), &(200 * 1024 * 1024).into()).unwrap(); - let output = exec_file_sync("wasm2wat", &args, &opts).to_string(); + // Use `std::env::args` to find the path to our executable. Assume the + // environment is configured such that we can read that file. Read it and + // use the `wasmprinter` crate to transform the binary to text, then search + // the text for appropriately named functions. + let me = std::env::args() + .next() + .expect("failed to find current wasm file"); + let output = wasmprinter::print_file(&me).unwrap(); let mut ret: HashSet = HashSet::new(); let mut lines = output.lines().map(|s| s.trim()); diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 6f00d46230..72599b4182 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -7,16 +7,14 @@ authors = [ "Gonzalo Brito Gadeschi ", ] description = "Examples of the stdarch crate." +edition = "2018" [dependencies] core_arch = { path = "../crates/core_arch" } std_detect = { path = "../crates/std_detect" } -quickcheck = "0.8" +quickcheck = "0.9" rand = "0.7" -[target.'cfg(target_arch = "wasm32")'.dependencies] -rand = { version = "0.6", features = ["wasm-bindgen"] } - [[bin]] name = "hex" path = "hex.rs" diff --git a/examples/hex.rs b/examples/hex.rs index b3d6fb0786..d9818d03e5 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -25,25 +25,15 @@ clippy::missing_docs_in_private_items )] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[macro_use(is_x86_feature_detected)] -extern crate std_detect; - -extern crate core_arch; - -#[cfg(test)] -#[macro_use] -extern crate quickcheck; - use std::{ io::{self, Read}, str, }; #[cfg(target_arch = "x86")] -use core_arch::x86::*; +use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected}; #[cfg(target_arch = "x86_64")] -use core_arch::x86_64::*; +use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected}; fn main() { let mut input = Vec::new(); @@ -68,6 +58,12 @@ fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { return unsafe { hex_encode_sse41(src, dst) }; } } + #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] + { + if true { + return unsafe { hex_encode_simd128(src, dst) }; + } + } hex_encode_fallback(src, dst) } @@ -157,6 +153,53 @@ unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } +#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] +unsafe fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + use core_arch::arch::wasm32::*; + + let ascii_zero = i8x16_splat(b'0' as i8); + let nines = i8x16_splat(9); + let ascii_a = i8x16_splat((b'a' - 9 - 1) as i8); + let and4bits = i8x16_splat(0xf); + + let mut i = 0_isize; + while src.len() >= 16 { + let invec = v128_load(src.as_ptr() as *const _); + + let masked1 = v128_and(invec, and4bits); + let masked2 = v128_and(i8x16_shr_u(invec, 4), and4bits); + + // return 0xff corresponding to the 
elements > 9, or 0x00 otherwise + let cmpmask1 = i8x16_gt_u(masked1, nines); + let cmpmask2 = i8x16_gt_u(masked2, nines); + + // add '0' or the offset depending on the masks + let masked1 = i8x16_add(masked1, v128_bitselect(ascii_a, ascii_zero, cmpmask1)); + let masked2 = i8x16_add(masked2, v128_bitselect(ascii_a, ascii_zero, cmpmask2)); + + // Next we need to shuffle around masked{1,2} to get back to the + // original source text order. The first element (res1) we'll store uses + // all the low bytes from the 2 masks and the second element (res2) uses + // all the upper bytes. + let res1 = v8x16_shuffle!( + masked2, masked1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + ); + let res2 = v8x16_shuffle!( + masked2, masked1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ); + + v128_store(dst.as_mut_ptr().offset(i * 2) as *mut _, res1); + v128_store(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2); + src = &src[16..]; + i += 16; + } + + let i = i as usize; + let _ = hex_encode_fallback(src, &mut dst[i * 2..]); + + Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) +} + fn hex_encode_fallback<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { fn hex(byte: u8) -> u8 { static TABLE: &[u8] = b"0123456789abcdef"; @@ -186,10 +229,10 @@ mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe { - if is_x86_feature_detected!("avx2") { + if self::is_x86_feature_detected!("avx2") { assert_eq!(hex_encode_avx2(input, &mut tmp()).unwrap(), output); } - if is_x86_feature_detected!("sse4.1") { + if self::is_x86_feature_detected!("sse4.1") { assert_eq!(hex_encode_sse41(input, &mut tmp()).unwrap(), output); } } @@ -236,7 +279,7 @@ mod tests { ); } - quickcheck! { + quickcheck::quickcheck! { fn encode_equals_fallback(input: Vec) -> bool { let mut space1 = vec![0; input.len() * 2]; let mut space2 = vec![0; input.len() * 2]; @@ -247,7 +290,7 @@ mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn avx_equals_fallback(input: Vec) -> bool { - if !is_x86_feature_detected!("avx2") { + if !self::is_x86_feature_detected!("avx2") { return true } let mut space1 = vec![0; input.len() * 2]; @@ -259,7 +302,7 @@ mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn sse41_equals_fallback(input: Vec) -> bool { - if !is_x86_feature_detected!("avx2") { + if !self::is_x86_feature_detected!("avx2") { return true } let mut space1 = vec![0; input.len() * 2]; @@ -328,28 +371,28 @@ mod benches { #[bench] fn small_avx2(b: &mut test::Bencher) { - if is_x86_feature_detected!("avx2") { + if self::is_x86_feature_detected!("avx2") { doit(b, SMALL_LEN, hex_encode_avx2); } } #[bench] fn small_sse41(b: &mut test::Bencher) { - if is_x86_feature_detected!("sse4.1") { + if self::is_x86_feature_detected!("sse4.1") { doit(b, SMALL_LEN, hex_encode_sse41); } } #[bench] fn large_avx2(b: &mut test::Bencher) { - if is_x86_feature_detected!("avx2") { + if self::is_x86_feature_detected!("avx2") { doit(b, LARGE_LEN, hex_encode_avx2); } } #[bench] fn large_sse41(b: &mut test::Bencher) { - if is_x86_feature_detected!("sse4.1") { + if self::is_x86_feature_detected!("sse4.1") { doit(b, LARGE_LEN, hex_encode_sse41); } } diff --git a/examples/wasm.rs b/examples/wasm.rs index 53f9c55d4e..6b92ae9b87 100644 --- a/examples/wasm.rs +++ b/examples/wasm.rs @@ -3,11 +3,9 @@ #![feature(stdsimd)] #![cfg(target_arch = "wasm32")] -extern crate core_arch; - use std::ptr; -use core_arch::wasm32::*; +use core_arch::arch::wasm32::*; static mut HEAD: *mut *mut u8 = 0 as 
_; From 9eaebc271ae21a4eb87fc63498b52b9be3f4e3e2 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Tue, 14 Jul 2020 05:29:08 -0700 Subject: [PATCH 02/15] wasm: Unconditionally expose SIMD functions This commit unconditionally exposes SIMD functions from the `wasm32` module. This is done in such a way that the standard library does not need to be recompiled to access SIMD intrinsics and use them. This, hopefully, is the long-term story for SIMD in WebAssembly in Rust. It's unlikely that all WebAssembly runtimes will end up implementing SIMD so the standard library is unlikely to use SIMD any time soon, but we want to make sure it's easily available to folks! This commit enables all this by ensuring that SIMD is available to the standard library, regardless of compilation flags. This'll come with the same caveats as x86 support, where it doesn't make sense to call these functions unless you're enabling simd support one way or another locally. Additionally, as with x86, if you don't call these functions then the instructions won't show up in your binary. While I was here I went ahead and expanded the WebAssembly-specific documentation for the wasm32 module as well, ensuring that the current state of SIMD/Atomics is documented. --- crates/assert-instr-macro/src/lib.rs | 7 + crates/core_arch/build.rs | 13 + crates/core_arch/src/mod.rs | 110 ++- crates/core_arch/src/wasm32/mod.rs | 2 - crates/core_arch/src/wasm32/simd128.rs | 1024 +++++++++++++---------- crates/core_arch/tests/xcrate-macros.rs | 24 +- crates/stdarch-test/Cargo.toml | 7 +- examples/hex.rs | 7 +- 8 files changed, 731 insertions(+), 463 deletions(-) diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs index 200f02fae5..0c03e80653 100644 --- a/crates/assert-instr-macro/src/lib.rs +++ b/crates/assert-instr-macro/src/lib.rs @@ -122,6 +122,13 @@ pub fn assert_instr( // generate some code that's hopefully very tight in terms of // codegen but is otherwise unique to prevent code from being // folded. + // + // This is avoided on wasm32 right now since these functions aren't + // inlined which breaks our tests since each intrinsic looks like it + // calls functions. Turns out functions aren't similar enough to get + // merged on wasm32 anyway. This bug is tracked at + // rust-lang/rust#74320. + #[cfg(not(target_arch = "wasm32"))] ::stdarch_test::_DONT_DEDUP.store( std::mem::transmute(#shim_name_str.as_bytes().as_ptr()), std::sync::atomic::Ordering::Relaxed, diff --git a/crates/core_arch/build.rs b/crates/core_arch/build.rs index 4d65e9ddc3..e0c538ceb4 100644 --- a/crates/core_arch/build.rs +++ b/crates/core_arch/build.rs @@ -1,3 +1,16 @@ +use std::env; + fn main() { println!("cargo:rustc-cfg=core_arch_docs"); + + // Used to tell our `#[assert_instr]` annotations that all simd intrinsics + // are available to test their codegen, since some are gated behind an extra + // `-Ctarget-feature=+unimplemented-simd128` that doesn't have any + // equivalent in `#[target_feature]` right now. + if env::var("RUSTFLAGS") + .unwrap_or_default() + .contains("unimplemented-simd128") + { + println!("cargo:rustc-cfg=all_simd"); + } } diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs index 19f61affdd..eda2254e29 100644 --- a/crates/core_arch/src/mod.rs +++ b/crates/core_arch/src/mod.rs @@ -59,14 +59,110 @@ pub mod arch { /// Platform-specific intrinsics for the `wasm32` platform. /// - - /// # Availability + /// This module provides intrinsics specific to the WebAssembly + /// architecture.
Here you'll find intrinsics necessary for leveraging + /// WebAssembly proposals such as [atomics] and [simd]. These proposals are + /// evolving over time and as such the support here is unstable and requires + /// the nightly channel. As WebAssembly proposals stabilize, these functions + /// will also become stable. /// - /// Note that intrinsics gated by `target_feature = "atomics"` or `target_feature = "simd128"` - /// are only available **when the standard library itself is compiled with the the respective - /// target feature**. This version of the standard library is not obtainable via `rustup`, - /// but rather will require the standard library to be compiled from source. - /// See the [module documentation](../index.html) for more details. + /// [atomics]: https://github.com/webassembly/threads + /// [simd]: https://github.com/webassembly/simd + /// + /// See the [module documentation](../index.html) for general information + /// about the `arch` module and platform intrinsics. + /// + /// ## Atomics + /// + /// The [threads proposal][atomics] for WebAssembly adds a number of + /// instructions for dealing with multithreaded programs. Atomic + /// instructions can all be generated through `std::sync::atomic` types, but + /// some instructions have no equivalent in Rust such as + /// `memory.atomic.notify`, so this module will provide these intrinsics. + /// + /// At this time, however, these intrinsics are only available **when the + /// standard library itself is compiled with atomics**. Compiling with + /// atomics is not enabled by default and requires passing + /// `-Ctarget-feature=+atomics` to rustc. The standard library shipped via + /// `rustup` is not compiled with atomics. To get access to these intrinsics + /// you'll need to compile the standard library from source with the + /// requisite compiler flags. + /// + /// ## SIMD + /// + /// The [simd proposal][simd] for WebAssembly adds a new `v128` type for a + /// 128-bit SIMD register. It also adds a large array of instructions to + /// operate on the `v128` type to perform data processing. The SIMD proposal + /// has been in progress for quite some time and many instructions have come + /// and gone. This module attempts to keep up with the proposal, but if you + /// notice anything awry please feel free to [open an + /// issue](https://github.com/rust-lang/stdarch/issues/new). + /// + /// It's important to be aware that the current state of development of SIMD + /// in WebAssembly is still somewhat early days. There's lots of pieces to + /// demo and prototype with, but discussions and support are still in + /// progress. There's a number of pitfalls and gotchas in various places, + /// which we will attempt to document here, but there may be others + /// lurking! + /// + /// Using SIMD is intended to be similar to how you would use it on + /// `x86_64`, for example. You'd write a function such as: + /// + /// ```rust,ignore + /// #[cfg(target_arch = "wasm32")] + /// #[target_feature(enable = "simd128")] + /// unsafe fn uses_simd() { + /// use std::arch::wasm32::*; + /// // ... + /// } + /// ``` + /// + /// Unlike `x86_64`, however, WebAssembly does not currently have dynamic + /// detection at runtime as to whether SIMD is supported (this is one of the + /// motivators for the [conditional sections proposal][condsections], but + /// that is still pretty early days). This means that your binary will + /// either have SIMD and can only run on engines which support SIMD, or it + /// will not have SIMD at all.
For compatibility, the standard library itself + /// does not use any SIMD internally. Determining how best to ship your + /// WebAssembly binary with SIMD is largely left up to you as it can be + /// pretty nuanced depending on your situation. + /// + /// [condsections]: https://github.com/webassembly/conditional-sections + /// + /// To enable SIMD support at compile time you need to do one of two things: + /// + /// * First you can annotate functions with `#[target_feature(enable = + /// "simd128")]`. This causes just that one function to have SIMD support + /// available to it, and intrinsics will get inlined as usual in this + /// situation. + /// + /// * Second you can compile your program with `-Ctarget-feature=+simd128`. + /// This compilation flag blanket enables SIMD support for your entire + /// compilation. Note that this does not include the standard library + /// unless you recompile the standard library. + /// + /// If you enable SIMD via either of these routes then you'll have a + /// WebAssembly binary that uses SIMD instructions, and you'll need to ship + /// that accordingly. Also note that if you call SIMD intrinsics but don't + /// enable SIMD via either of these mechanisms, you'll still have SIMD + /// generated in your program. This means to generate a binary without SIMD + /// you'll need to avoid both options above as well as calling into any + /// intrinsics in this module. + /// + /// > **Note**: Due to + /// > [rust-lang/rust#74320](https://github.com/rust-lang/rust/issues/74320) + /// > it's recommended to compile your entire program with SIMD support + /// > (using `RUSTFLAGS`) or otherwise functions may not be inlined + /// > correctly. + /// + /// > **Note**: LLVM's SIMD support is actually split into two features: + /// > `simd128` and `unimplemented-simd128`. Rust code can enable `simd128` + /// > with `#[target_feature]` (and test for it with `#[cfg(target_feature = + /// > "simd128")]`), but it cannot enable `unimplemented-simd128`. The only + /// > way to enable this feature is to compile with + /// > `-Ctarget-feature=+simd128,+unimplemented-simd128`. This second + /// > feature enables more recent instructions implemented in LLVM which + /// > haven't always had enough time to make their way to runtimes. #[cfg(any(target_arch = "wasm32", dox))] #[doc(cfg(target_arch = "wasm32"))] #[stable(feature = "simd_wasm32", since = "1.33.0")] diff --git a/crates/core_arch/src/wasm32/mod.rs b/crates/core_arch/src/wasm32/mod.rs index a8becb64ad..10f07ce610 100644 --- a/crates/core_arch/src/wasm32/mod.rs +++ b/crates/core_arch/src/wasm32/mod.rs @@ -8,9 +8,7 @@ mod atomic; #[cfg(any(target_feature = "atomics", dox))] pub use self::atomic::*; -#[cfg(any(target_feature = "simd128", dox))] mod simd128; -#[cfg(any(target_feature = "simd128", dox))] pub use self::simd128::*; mod memory; diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index 6c7599f4a2..76be73a98f 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -25,62 +25,73 @@ types!
{ #[allow(non_camel_case_types)] #[unstable(feature = "stdimd_internal", issue = "none")] pub(crate) trait v128Ext: Sized { - fn as_v128(self) -> v128; + unsafe fn as_v128(self) -> v128; #[inline] - fn as_u8x16(self) -> u8x16 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_u8x16(self) -> u8x16 { + transmute(self.as_v128()) } #[inline] - fn as_u16x8(self) -> u16x8 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_u16x8(self) -> u16x8 { + transmute(self.as_v128()) } #[inline] - fn as_u32x4(self) -> u32x4 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_u32x4(self) -> u32x4 { + transmute(self.as_v128()) } #[inline] - fn as_u64x2(self) -> u64x2 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_u64x2(self) -> u64x2 { + transmute(self.as_v128()) } #[inline] - fn as_i8x16(self) -> i8x16 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_i8x16(self) -> i8x16 { + transmute(self.as_v128()) } #[inline] - fn as_i16x8(self) -> i16x8 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_i16x8(self) -> i16x8 { + transmute(self.as_v128()) } #[inline] - fn as_i32x4(self) -> i32x4 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_i32x4(self) -> i32x4 { + transmute(self.as_v128()) } #[inline] - fn as_i64x2(self) -> i64x2 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_i64x2(self) -> i64x2 { + transmute(self.as_v128()) } #[inline] - fn as_f32x4(self) -> f32x4 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_f32x4(self) -> f32x4 { + transmute(self.as_v128()) } #[inline] - fn as_f64x2(self) -> f64x2 { - unsafe { transmute(self.as_v128()) } + #[target_feature(enable = "simd128")] + unsafe fn as_f64x2(self) -> f64x2 { + transmute(self.as_v128()) } } impl v128Ext for v128 { #[inline] - fn as_v128(self) -> Self { + #[target_feature(enable = "simd128")] + unsafe fn as_v128(self) -> Self { self } } @@ -183,55 +194,63 @@ extern "C" { /// Loads a `v128` vector from the given heap address. 
#[inline] #[cfg_attr(test, assert_instr(v128.load))] +#[target_feature(enable = "simd128")] pub unsafe fn v128_load(m: *const v128) -> v128 { - ptr::read(m) + *m } /// Load eight 8-bit integers and sign extend each one to a 16-bit lane #[inline] -#[cfg_attr(test, assert_instr(i16x8.load8x8_s))] +#[cfg_attr(all(test, all_simd), assert_instr(i16x8.load8x8_s))] +#[target_feature(enable = "simd128")] pub unsafe fn i16x8_load8x8_s(m: *const i8) -> v128 { - transmute(simd_cast::<_, i16x8>(ptr::read(m as *const i8x8))) + transmute(simd_cast::<_, i16x8>(*(m as *const i8x8))) } /// Load eight 8-bit integers and zero extend each one to a 16-bit lane #[inline] -#[cfg_attr(test, assert_instr(i16x8.load8x8_u))] +#[cfg_attr(all(test, all_simd), assert_instr(i16x8.load8x8_u))] +#[target_feature(enable = "simd128")] pub unsafe fn i16x8_load8x8_u(m: *const u8) -> v128 { - transmute(simd_cast::<_, u16x8>(ptr::read(m as *const u8x8))) + transmute(simd_cast::<_, u16x8>(*(m as *const u8x8))) } /// Load four 16-bit integers and sign extend each one to a 32-bit lane #[inline] -#[cfg_attr(test, assert_instr(i32x4.load16x4_s))] +#[cfg_attr(all(test, all_simd), assert_instr(i32x4.load16x4_s))] +#[target_feature(enable = "simd128")] pub unsafe fn i32x4_load16x4_s(m: *const i16) -> v128 { - transmute(simd_cast::<_, i32x4>(ptr::read(m as *const i16x4))) + transmute(simd_cast::<_, i32x4>(*(m as *const i16x4))) } /// Load four 16-bit integers and zero extend each one to a 32-bit lane #[inline] -#[cfg_attr(test, assert_instr(i32x4.load16x4_u))] +#[cfg_attr(all(test, all_simd), assert_instr(i32x4.load16x4_u))] +#[target_feature(enable = "simd128")] pub unsafe fn i32x4_load16x4_u(m: *const u16) -> v128 { - transmute(simd_cast::<_, u32x4>(ptr::read(m as *const u16x4))) + transmute(simd_cast::<_, u32x4>(*(m as *const u16x4))) } /// Load two 32-bit integers and sign extend each one to a 64-bit lane #[inline] -#[cfg_attr(test, assert_instr(i64x2.load32x2_s))] +#[cfg_attr(all(test, all_simd), assert_instr(i64x2.load32x2_s))] +#[target_feature(enable = "simd128")] pub unsafe fn i64x2_load32x2_s(m: *const i32) -> v128 { - transmute(simd_cast::<_, i64x2>(ptr::read(m as *const i32x2))) + transmute(simd_cast::<_, i64x2>(*(m as *const i32x2))) } /// Load two 32-bit integers and zero extend each one to a 64-bit lane #[inline] -#[cfg_attr(test, assert_instr(i64x2.load32x2_u))] +#[cfg_attr(all(test, all_simd), assert_instr(i64x2.load32x2_u))] +#[target_feature(enable = "simd128")] pub unsafe fn i64x2_load32x2_u(m: *const u32) -> v128 { - transmute(simd_cast::<_, u64x2>(ptr::read(m as *const u32x2))) + transmute(simd_cast::<_, u64x2>(*(m as *const u32x2))) } /// Load a single element and splat to all lanes of a v128 vector. #[inline] -#[cfg_attr(test, assert_instr(v8x16.load_splat))] +#[cfg_attr(all(test, all_simd), assert_instr(v8x16.load_splat))] +#[target_feature(enable = "simd128")] pub unsafe fn v8x16_load_splat(m: *const u8) -> v128 { let v = *m; transmute(u8x16(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)) @@ -239,7 +258,8 @@ pub unsafe fn v8x16_load_splat(m: *const u8) -> v128 { /// Load a single element and splat to all lanes of a v128 vector. 
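// Editorial sketch (not part of this patch): the extending loads above read a
// half-width vector from memory and widen every lane. For example,
// `i16x8_load8x8_u` reads eight bytes and zero-extends each one into a 16-bit
// lane, so a 0xff byte becomes the lane value 255 rather than -1.
#[target_feature(enable = "simd128")]
unsafe fn example_extending_load() {
    let bytes = [0xff_u8; 8];
    let v = i16x8_load8x8_u(bytes.as_ptr());
    assert_eq!(i16x8_extract_lane(v, 0), 255);
}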
#[inline] -#[cfg_attr(test, assert_instr(v16x8.load_splat))] +#[cfg_attr(all(test, all_simd), assert_instr(v16x8.load_splat))] +#[target_feature(enable = "simd128")] pub unsafe fn v16x8_load_splat(m: *const u16) -> v128 { let v = *m; transmute(u16x8(v, v, v, v, v, v, v, v)) @@ -247,7 +267,8 @@ pub unsafe fn v16x8_load_splat(m: *const u16) -> v128 { /// Load a single element and splat to all lanes of a v128 vector. #[inline] -#[cfg_attr(test, assert_instr(v32x4.load_splat))] +#[cfg_attr(all(test, all_simd), assert_instr(v32x4.load_splat))] +#[target_feature(enable = "simd128")] pub unsafe fn v32x4_load_splat(m: *const u32) -> v128 { let v = *m; transmute(u32x4(v, v, v, v)) @@ -255,7 +276,8 @@ pub unsafe fn v32x4_load_splat(m: *const u32) -> v128 { /// Load a single element and splat to all lanes of a v128 vector. #[inline] -#[cfg_attr(test, assert_instr(v64x2.load_splat))] +#[cfg_attr(all(test, all_simd), assert_instr(v64x2.load_splat))] +#[target_feature(enable = "simd128")] pub unsafe fn v64x2_load_splat(m: *const u64) -> v128 { let v = *m; transmute(u64x2(v, v)) @@ -264,8 +286,9 @@ pub unsafe fn v64x2_load_splat(m: *const u64) -> v128 { /// Stores a `v128` vector to the given heap address. #[inline] #[cfg_attr(test, assert_instr(v128.store))] +#[target_feature(enable = "simd128")] pub unsafe fn v128_store(m: *mut v128, a: v128) { - ptr::write(m, a) + *m = a; } /// Materializes a constant SIMD value from the immediate operands. @@ -274,7 +297,7 @@ pub unsafe fn v128_store(m: *mut v128, a: v128) { /// `imm` which provide the bits of the vector directly. #[inline] #[rustc_args_required_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)] -#[cfg_attr(test, assert_instr( +#[cfg_attr(all(test, all_simd), assert_instr( v128.const, a0 = 0, a1 = 1, @@ -293,7 +316,8 @@ pub unsafe fn v128_store(m: *mut v128, a: v128) { a14 = 14, a15 = 15, ))] -pub const fn v128_const( +#[target_feature(enable = "simd128")] +pub const unsafe fn v128_const( a0: u8, a1: u8, a2: u8, @@ -315,14 +339,12 @@ pub const fn v128_const( imm: [u8; 16], vec: v128, } - unsafe { - U { - imm: [ - a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, - ], - } - .vec + U { + imm: [ + a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, + ], } + .vec } /// Returns a new vector with lanes selected from the lanes of the two input @@ -418,8 +440,9 @@ pub struct __v8x16_shuffle_u8x16( /// indices outside of the range the resulting lane is 0. #[inline] #[cfg_attr(test, assert_instr(v8x16.swizzle))] -pub fn v8x16_swizzle(a: v128, s: v128) -> v128 { - unsafe { transmute(llvm_swizzle(transmute(a), transmute(s))) } +#[target_feature(enable = "simd128")] +pub unsafe fn v8x16_swizzle(a: v128, s: v128) -> v128 { + transmute(llvm_swizzle(transmute(a), transmute(s))) } /// Creates a vector with identical lanes. @@ -427,8 +450,9 @@ pub fn v8x16_swizzle(a: v128, s: v128) -> v128 { /// Constructs a vector with `x` replicated to all 16 lanes. #[inline] #[cfg_attr(test, assert_instr(i8x16.splat))] -pub fn i8x16_splat(a: i8) -> v128 { - unsafe { transmute(i8x16::splat(a)) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_splat(a: i8) -> v128 { + transmute(i8x16::splat(a)) } /// Creates a vector with identical lanes. @@ -436,8 +460,9 @@ pub fn i8x16_splat(a: i8) -> v128 { /// Construct a vector with `x` replicated to all 8 lanes. 
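// Editorial sketch (not part of this patch): `v8x16_swizzle` picks bytes of
// `a` using the byte indices in `s`; any index of 16 or greater yields 0 in
// that lane.
#[target_feature(enable = "simd128")]
unsafe fn example_swizzle() {
    let a = v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    // Swap the first two bytes and request one out-of-range index (0xff).
    let s = v128_const(1, 0, 0xff, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let r = v8x16_swizzle(a, s);
    assert_eq!(i8x16_extract_lane(r, 0), 1); // byte 1 of `a`
    assert_eq!(i8x16_extract_lane(r, 2), 0); // out-of-range index gives 0
}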
#[inline] #[cfg_attr(test, assert_instr(i16x8.splat))] -pub fn i16x8_splat(a: i16) -> v128 { - unsafe { transmute(i16x8::splat(a)) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_splat(a: i16) -> v128 { + transmute(i16x8::splat(a)) } /// Creates a vector with identical lanes. @@ -445,8 +470,9 @@ pub fn i16x8_splat(a: i16) -> v128 { /// Constructs a vector with `x` replicated to all 4 lanes. #[inline] #[cfg_attr(test, assert_instr(i32x4.splat))] -pub fn i32x4_splat(a: i32) -> v128 { - unsafe { transmute(i32x4::splat(a)) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_splat(a: i32) -> v128 { + transmute(i32x4::splat(a)) } /// Creates a vector with identical lanes. @@ -454,8 +480,9 @@ pub fn i32x4_splat(a: i32) -> v128 { /// Construct a vector with `x` replicated to all 2 lanes. #[inline] #[cfg_attr(test, assert_instr(i64x2.splat))] -pub fn i64x2_splat(a: i64) -> v128 { - unsafe { transmute(i64x2::splat(a)) } +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_splat(a: i64) -> v128 { + transmute(i64x2::splat(a)) } /// Creates a vector with identical lanes. @@ -463,8 +490,9 @@ pub fn i64x2_splat(a: i64) -> v128 { /// Constructs a vector with `x` replicated to all 4 lanes. #[inline] #[cfg_attr(test, assert_instr(f32x4.splat))] -pub fn f32x4_splat(a: f32) -> v128 { - unsafe { transmute(f32x4::splat(a)) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_splat(a: f32) -> v128 { + transmute(f32x4::splat(a)) } /// Creates a vector with identical lanes. @@ -472,8 +500,9 @@ pub fn f32x4_splat(a: f32) -> v128 { /// Constructs a vector with `x` replicated to all 2 lanes. #[inline] #[cfg_attr(test, assert_instr(f64x2.splat))] -pub fn f64x2_splat(a: f64) -> v128 { - unsafe { transmute(f64x2::splat(a)) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_splat(a: f64) -> v128 { + transmute(f64x2::splat(a)) } /// Extracts a lane from a 128-bit vector interpreted as 16 packed i8 numbers. @@ -487,20 +516,23 @@ pub fn f64x2_splat(a: f64) -> v128 { /// 16. #[inline] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn i8x16_extract_lane(a: v128, imm: usize) -> i8 { simd_extract(a.as_i8x16(), imm as u32) } #[cfg(test)] #[assert_instr(i8x16.extract_lane_s)] -fn i8x16_extract_lane_s(a: v128) -> i32 { - unsafe { i8x16_extract_lane(a, 0) as i32 } +#[target_feature(enable = "simd128")] +unsafe fn i8x16_extract_lane_s(a: v128) -> i32 { + i8x16_extract_lane(a, 0) as i32 } #[cfg(test)] #[assert_instr(i8x16.extract_lane_u)] -fn i8x16_extract_lane_u(a: v128) -> u32 { - unsafe { i8x16_extract_lane(a, 0) as u8 as u32 } +#[target_feature(enable = "simd128")] +unsafe fn i8x16_extract_lane_u(a: v128) -> u32 { + i8x16_extract_lane(a, 0) as u8 as u32 } /// Replaces a lane from a 128-bit vector interpreted as 16 packed i8 numbers. @@ -515,6 +547,7 @@ fn i8x16_extract_lane_u(a: v128) -> u32 { #[inline] #[cfg_attr(test, assert_instr(i8x16.replace_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn i8x16_replace_lane(a: v128, imm: usize, val: i8) -> v128 { transmute(simd_insert(a.as_i8x16(), imm as u32, val)) } @@ -530,20 +563,23 @@ pub unsafe fn i8x16_replace_lane(a: v128, imm: usize, val: i8) -> v128 { /// 8. 
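// Editorial sketch (not part of this patch): lane indices for the
// `*_extract_lane` and `*_replace_lane` intrinsics must be constant
// expressions, and replacing one lane leaves the remaining lanes untouched.
#[target_feature(enable = "simd128")]
unsafe fn example_replace_lane() {
    let a = i8x16_splat(1);
    let b = i8x16_replace_lane(a, 15, 42);
    assert_eq!(i8x16_extract_lane(b, 15), 42);
    assert_eq!(i8x16_extract_lane(b, 0), 1);
}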
#[inline] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn i16x8_extract_lane(a: v128, imm: usize) -> i16 { simd_extract(a.as_i16x8(), imm as u32) } #[cfg(test)] #[assert_instr(i16x8.extract_lane_s)] -fn i16x8_extract_lane_s(a: v128) -> i32 { - unsafe { i16x8_extract_lane(a, 0) as i32 } +#[target_feature(enable = "simd128")] +unsafe fn i16x8_extract_lane_s(a: v128) -> i32 { + i16x8_extract_lane(a, 0) as i32 } #[cfg(test)] #[assert_instr(i16x8.extract_lane_u)] -fn i16x8_extract_lane_u(a: v128) -> u32 { - unsafe { i16x8_extract_lane(a, 0) as u16 as u32 } +#[target_feature(enable = "simd128")] +unsafe fn i16x8_extract_lane_u(a: v128) -> u32 { + i16x8_extract_lane(a, 0) as u16 as u32 } /// Replaces a lane from a 128-bit vector interpreted as 8 packed i16 numbers. @@ -558,6 +594,7 @@ fn i16x8_extract_lane_u(a: v128) -> u32 { #[inline] #[cfg_attr(test, assert_instr(i16x8.replace_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn i16x8_replace_lane(a: v128, imm: usize, val: i16) -> v128 { transmute(simd_insert(a.as_i16x8(), imm as u32, val)) } @@ -574,6 +611,7 @@ pub unsafe fn i16x8_replace_lane(a: v128, imm: usize, val: i16) -> v128 { #[inline] #[cfg_attr(test, assert_instr(i32x4.extract_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn i32x4_extract_lane(a: v128, imm: usize) -> i32 { simd_extract(a.as_i32x4(), imm as u32) } @@ -590,6 +628,7 @@ pub unsafe fn i32x4_extract_lane(a: v128, imm: usize) -> i32 { #[inline] #[cfg_attr(test, assert_instr(i32x4.replace_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn i32x4_replace_lane(a: v128, imm: usize, val: i32) -> v128 { transmute(simd_insert(a.as_i32x4(), imm as u32, val)) } @@ -606,6 +645,7 @@ pub unsafe fn i32x4_replace_lane(a: v128, imm: usize, val: i32) -> v128 { #[inline] #[cfg_attr(test, assert_instr(i64x2.extract_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn i64x2_extract_lane(a: v128, imm: usize) -> i64 { simd_extract(a.as_i64x2(), imm as u32) } @@ -622,6 +662,7 @@ pub unsafe fn i64x2_extract_lane(a: v128, imm: usize) -> i64 { #[inline] #[cfg_attr(test, assert_instr(i64x2.replace_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn i64x2_replace_lane(a: v128, imm: usize, val: i64) -> v128 { transmute(simd_insert(a.as_i64x2(), imm as u32, val)) } @@ -638,6 +679,7 @@ pub unsafe fn i64x2_replace_lane(a: v128, imm: usize, val: i64) -> v128 { #[inline] #[cfg_attr(test, assert_instr(f32x4.extract_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn f32x4_extract_lane(a: v128, imm: usize) -> f32 { simd_extract(a.as_f32x4(), imm as u32) } @@ -654,6 +696,7 @@ pub unsafe fn f32x4_extract_lane(a: v128, imm: usize) -> f32 { #[inline] #[cfg_attr(test, assert_instr(f32x4.replace_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn f32x4_replace_lane(a: v128, imm: usize, val: f32) -> v128 { transmute(simd_insert(a.as_f32x4(), imm as u32, val)) } @@ -670,6 +713,7 @@ pub unsafe fn f32x4_replace_lane(a: v128, imm: usize, val: f32) -> v128 { #[inline] #[cfg_attr(test, assert_instr(f64x2.extract_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn f64x2_extract_lane(a: v128, imm: usize) -> f64 { simd_extract(a.as_f64x2(), imm 
as u32) } @@ -686,6 +730,7 @@ pub unsafe fn f64x2_extract_lane(a: v128, imm: usize) -> f64 { #[inline] #[cfg_attr(test, assert_instr(f64x2.replace_lane, imm = 0))] #[rustc_args_required_const(1)] +#[target_feature(enable = "simd128")] pub unsafe fn f64x2_replace_lane(a: v128, imm: usize, val: f64) -> v128 { transmute(simd_insert(a.as_f64x2(), imm as u32, val)) } @@ -697,8 +742,9 @@ pub unsafe fn f64x2_replace_lane(a: v128, imm: usize, val: f64) -> v128 { /// were equal, or all zeros if the elements were not equal. #[inline] #[cfg_attr(test, assert_instr(i8x16.eq))] -pub fn i8x16_eq(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_eq::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_eq(a: v128, b: v128) -> v128 { + transmute(simd_eq::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -708,8 +754,9 @@ pub fn i8x16_eq(a: v128, b: v128) -> v128 { /// were not equal, or all zeros if the elements were equal. #[inline] #[cfg_attr(test, assert_instr(i8x16.ne))] -pub fn i8x16_ne(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ne::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_ne(a: v128, b: v128) -> v128 { + transmute(simd_ne::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -719,8 +766,9 @@ pub fn i8x16_ne(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i8x16.lt_s))] -pub fn i8x16_lt_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_lt::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_lt_s(a: v128, b: v128) -> v128 { + transmute(simd_lt::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -730,8 +778,9 @@ pub fn i8x16_lt_s(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i8x16.lt_u))] -pub fn i8x16_lt_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_lt::<_, i8x16>(a.as_u8x16(), b.as_u8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_lt_u(a: v128, b: v128) -> v128 { + transmute(simd_lt::<_, i8x16>(a.as_u8x16(), b.as_u8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -741,8 +790,9 @@ pub fn i8x16_lt_u(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i8x16.gt_s))] -pub fn i8x16_gt_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_gt::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_gt_s(a: v128, b: v128) -> v128 { + transmute(simd_gt::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -752,8 +802,9 @@ pub fn i8x16_gt_s(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. 
#[inline] #[cfg_attr(test, assert_instr(i8x16.gt_u))] -pub fn i8x16_gt_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_gt::<_, i8x16>(a.as_u8x16(), b.as_u8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_gt_u(a: v128, b: v128) -> v128 { + transmute(simd_gt::<_, i8x16>(a.as_u8x16(), b.as_u8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -763,8 +814,9 @@ pub fn i8x16_gt_u(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i8x16.le_s))] -pub fn i8x16_le_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_le::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_le_s(a: v128, b: v128) -> v128 { + transmute(simd_le::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -774,8 +826,9 @@ pub fn i8x16_le_s(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i8x16.le_u))] -pub fn i8x16_le_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_le::<_, i8x16>(a.as_u8x16(), b.as_u8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_le_u(a: v128, b: v128) -> v128 { + transmute(simd_le::<_, i8x16>(a.as_u8x16(), b.as_u8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -785,8 +838,9 @@ pub fn i8x16_le_u(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i8x16.ge_s))] -pub fn i8x16_ge_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ge::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_ge_s(a: v128, b: v128) -> v128 { + transmute(simd_ge::<_, i8x16>(a.as_i8x16(), b.as_i8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -796,8 +850,9 @@ pub fn i8x16_ge_s(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i8x16.ge_u))] -pub fn i8x16_ge_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ge::<_, i8x16>(a.as_u8x16(), b.as_u8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_ge_u(a: v128, b: v128) -> v128 { + transmute(simd_ge::<_, i8x16>(a.as_u8x16(), b.as_u8x16())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -807,8 +862,9 @@ pub fn i8x16_ge_u(a: v128, b: v128) -> v128 { /// were equal, or all zeros if the elements were not equal. #[inline] #[cfg_attr(test, assert_instr(i16x8.eq))] -pub fn i16x8_eq(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_eq::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_eq(a: v128, b: v128) -> v128 { + transmute(simd_eq::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -818,8 +874,9 @@ pub fn i16x8_eq(a: v128, b: v128) -> v128 { /// were not equal, or all zeros if the elements were equal. 
#[inline] #[cfg_attr(test, assert_instr(i16x8.ne))] -pub fn i16x8_ne(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ne::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_ne(a: v128, b: v128) -> v128 { + transmute(simd_ne::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -829,8 +886,9 @@ pub fn i16x8_ne(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i16x8.lt_s))] -pub fn i16x8_lt_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_lt::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_lt_s(a: v128, b: v128) -> v128 { + transmute(simd_lt::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -840,8 +898,9 @@ pub fn i16x8_lt_s(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i16x8.lt_u))] -pub fn i16x8_lt_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_lt::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_lt_u(a: v128, b: v128) -> v128 { + transmute(simd_lt::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -851,8 +910,9 @@ pub fn i16x8_lt_u(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i16x8.gt_s))] -pub fn i16x8_gt_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_gt::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_gt_s(a: v128, b: v128) -> v128 { + transmute(simd_gt::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -862,8 +922,9 @@ pub fn i16x8_gt_s(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i16x8.gt_u))] -pub fn i16x8_gt_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_gt::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_gt_u(a: v128, b: v128) -> v128 { + transmute(simd_gt::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -873,8 +934,9 @@ pub fn i16x8_gt_u(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i16x8.le_s))] -pub fn i16x8_le_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_le::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_le_s(a: v128, b: v128) -> v128 { + transmute(simd_le::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -884,8 +946,9 @@ pub fn i16x8_le_s(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. 
#[inline] #[cfg_attr(test, assert_instr(i16x8.le_u))] -pub fn i16x8_le_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_le::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_le_u(a: v128, b: v128) -> v128 { + transmute(simd_le::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -895,8 +958,9 @@ pub fn i16x8_le_u(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i16x8.ge_s))] -pub fn i16x8_ge_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ge::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_ge_s(a: v128, b: v128) -> v128 { + transmute(simd_ge::<_, i16x8>(a.as_i16x8(), b.as_i16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 8 sixteen-bit @@ -906,8 +970,9 @@ pub fn i16x8_ge_s(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i16x8.ge_u))] -pub fn i16x8_ge_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ge::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_ge_u(a: v128, b: v128) -> v128 { + transmute(simd_ge::<_, i16x8>(a.as_u16x8(), b.as_u16x8())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -917,8 +982,9 @@ pub fn i16x8_ge_u(a: v128, b: v128) -> v128 { /// were equal, or all zeros if the elements were not equal. #[inline] #[cfg_attr(test, assert_instr(i32x4.eq))] -pub fn i32x4_eq(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_eq::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_eq(a: v128, b: v128) -> v128 { + transmute(simd_eq::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -928,8 +994,9 @@ pub fn i32x4_eq(a: v128, b: v128) -> v128 { /// were not equal, or all zeros if the elements were equal. #[inline] #[cfg_attr(test, assert_instr(i32x4.ne))] -pub fn i32x4_ne(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ne::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_ne(a: v128, b: v128) -> v128 { + transmute(simd_ne::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -939,8 +1006,9 @@ pub fn i32x4_ne(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i32x4.lt_s))] -pub fn i32x4_lt_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_lt::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_lt_s(a: v128, b: v128) -> v128 { + transmute(simd_lt::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -950,8 +1018,9 @@ pub fn i32x4_lt_s(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. 
#[inline] #[cfg_attr(test, assert_instr(i32x4.lt_u))] -pub fn i32x4_lt_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_lt::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_lt_u(a: v128, b: v128) -> v128 { + transmute(simd_lt::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -961,8 +1030,9 @@ pub fn i32x4_lt_u(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i32x4.gt_s))] -pub fn i32x4_gt_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_gt::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_gt_s(a: v128, b: v128) -> v128 { + transmute(simd_gt::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -972,8 +1042,9 @@ pub fn i32x4_gt_s(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i32x4.gt_u))] -pub fn i32x4_gt_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_gt::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_gt_u(a: v128, b: v128) -> v128 { + transmute(simd_gt::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -983,8 +1054,9 @@ pub fn i32x4_gt_u(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i32x4.le_s))] -pub fn i32x4_le_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_le::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_le_s(a: v128, b: v128) -> v128 { + transmute(simd_le::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -994,8 +1066,9 @@ pub fn i32x4_le_s(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i32x4.le_u))] -pub fn i32x4_le_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_le::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_le_u(a: v128, b: v128) -> v128 { + transmute(simd_le::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -1005,8 +1078,9 @@ pub fn i32x4_le_u(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(i32x4.ge_s))] -pub fn i32x4_ge_s(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ge::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_ge_s(a: v128, b: v128) -> v128 { + transmute(simd_ge::<_, i32x4>(a.as_i32x4(), b.as_i32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -1016,8 +1090,9 @@ pub fn i32x4_ge_s(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. 
#[inline] #[cfg_attr(test, assert_instr(i32x4.ge_u))] -pub fn i32x4_ge_u(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ge::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_ge_u(a: v128, b: v128) -> v128 { + transmute(simd_ge::<_, i32x4>(a.as_u32x4(), b.as_u32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -1027,8 +1102,9 @@ pub fn i32x4_ge_u(a: v128, b: v128) -> v128 { /// were equal, or all zeros if the elements were not equal. #[inline] #[cfg_attr(test, assert_instr(f32x4.eq))] -pub fn f32x4_eq(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_eq::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_eq(a: v128, b: v128) -> v128 { + transmute(simd_eq::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -1038,8 +1114,9 @@ pub fn f32x4_eq(a: v128, b: v128) -> v128 { /// were not equal, or all zeros if the elements were equal. #[inline] #[cfg_attr(test, assert_instr(f32x4.ne))] -pub fn f32x4_ne(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ne::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_ne(a: v128, b: v128) -> v128 { + transmute(simd_ne::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -1049,8 +1126,9 @@ pub fn f32x4_ne(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(f32x4.lt))] -pub fn f32x4_lt(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_lt::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_lt(a: v128, b: v128) -> v128 { + transmute(simd_lt::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -1060,8 +1138,9 @@ pub fn f32x4_lt(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(f32x4.gt))] -pub fn f32x4_gt(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_gt::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_gt(a: v128, b: v128) -> v128 { + transmute(simd_gt::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -1071,8 +1150,9 @@ pub fn f32x4_gt(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(f32x4.le))] -pub fn f32x4_le(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_le::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_le(a: v128, b: v128) -> v128 { + transmute(simd_le::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 4 thirty-two-bit @@ -1082,8 +1162,9 @@ pub fn f32x4_le(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. 
#[inline] #[cfg_attr(test, assert_instr(f32x4.ge))] -pub fn f32x4_ge(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ge::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_ge(a: v128, b: v128) -> v128 { + transmute(simd_ge::<_, i32x4>(a.as_f32x4(), b.as_f32x4())) } /// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit @@ -1093,8 +1174,9 @@ pub fn f32x4_ge(a: v128, b: v128) -> v128 { /// were equal, or all zeros if the elements were not equal. #[inline] #[cfg_attr(test, assert_instr(f64x2.eq))] -pub fn f64x2_eq(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_eq::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_eq(a: v128, b: v128) -> v128 { + transmute(simd_eq::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } /// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit @@ -1104,8 +1186,9 @@ pub fn f64x2_eq(a: v128, b: v128) -> v128 { /// were not equal, or all zeros if the elements were equal. #[inline] #[cfg_attr(test, assert_instr(f64x2.ne))] -pub fn f64x2_ne(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ne::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_ne(a: v128, b: v128) -> v128 { + transmute(simd_ne::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } /// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit @@ -1115,8 +1198,9 @@ pub fn f64x2_ne(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(f64x2.lt))] -pub fn f64x2_lt(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_lt::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_lt(a: v128, b: v128) -> v128 { + transmute(simd_lt::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } /// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit @@ -1126,8 +1210,9 @@ pub fn f64x2_lt(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(f64x2.gt))] -pub fn f64x2_gt(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_gt::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_gt(a: v128, b: v128) -> v128 { + transmute(simd_gt::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } /// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit @@ -1137,8 +1222,9 @@ pub fn f64x2_gt(a: v128, b: v128) -> v128 { /// element is less than the pairwise right element, or all zeros otherwise. #[inline] #[cfg_attr(test, assert_instr(f64x2.le))] -pub fn f64x2_le(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_le::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_le(a: v128, b: v128) -> v128 { + transmute(simd_le::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } /// Compares two 128-bit vectors as if they were two vectors of 2 sixty-four-bit @@ -1148,96 +1234,103 @@ pub fn f64x2_le(a: v128, b: v128) -> v128 { /// element is greater than the pairwise right element, or all zeros otherwise. 
#[inline] #[cfg_attr(test, assert_instr(f64x2.ge))] -pub fn f64x2_ge(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_ge::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_ge(a: v128, b: v128) -> v128 { + transmute(simd_ge::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } /// Flips each bit of the 128-bit input vector. #[inline] #[cfg_attr(test, assert_instr(v128.not))] -pub fn v128_not(a: v128) -> v128 { - unsafe { transmute(simd_xor(a.as_i64x2(), i64x2(!0, !0))) } +#[target_feature(enable = "simd128")] +pub unsafe fn v128_not(a: v128) -> v128 { + transmute(simd_xor(a.as_i64x2(), i64x2(!0, !0))) } /// Performs a bitwise and of the two input 128-bit vectors, returning the /// resulting vector. #[inline] #[cfg_attr(test, assert_instr(v128.and))] -pub fn v128_and(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_and(a.as_i64x2(), b.as_i64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn v128_and(a: v128, b: v128) -> v128 { + transmute(simd_and(a.as_i64x2(), b.as_i64x2())) } /// Bitwise AND of bits of `a` and the logical inverse of bits of `b`. /// /// This operation is equivalent to `v128.and(a, v128.not(b))` #[inline] -#[cfg_attr(test, assert_instr(v128.andnot))] -pub fn v128_andnot(a: v128, b: v128) -> v128 { - unsafe { - transmute(simd_and( - a.as_i64x2(), - simd_xor(b.as_i64x2(), i64x2(-1, -1)), - )) - } +#[cfg_attr(all(test, all_simd), assert_instr(v128.andnot))] +#[target_feature(enable = "simd128")] +pub unsafe fn v128_andnot(a: v128, b: v128) -> v128 { + transmute(simd_and( + a.as_i64x2(), + simd_xor(b.as_i64x2(), i64x2(-1, -1)), + )) } /// Performs a bitwise or of the two input 128-bit vectors, returning the /// resulting vector. #[inline] #[cfg_attr(test, assert_instr(v128.or))] -pub fn v128_or(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_or(a.as_i64x2(), b.as_i64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn v128_or(a: v128, b: v128) -> v128 { + transmute(simd_or(a.as_i64x2(), b.as_i64x2())) } /// Performs a bitwise xor of the two input 128-bit vectors, returning the /// resulting vector. #[inline] #[cfg_attr(test, assert_instr(v128.xor))] -pub fn v128_xor(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_xor(a.as_i64x2(), b.as_i64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn v128_xor(a: v128, b: v128) -> v128 { + transmute(simd_xor(a.as_i64x2(), b.as_i64x2())) } /// Use the bitmask in `c` to select bits from `v1` when 1 and `v2` when 0. #[inline] #[cfg_attr(test, assert_instr(v128.bitselect))] -pub fn v128_bitselect(v1: v128, v2: v128, c: v128) -> v128 { - unsafe { transmute(llvm_bitselect(v1.as_i8x16(), v2.as_i8x16(), c.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn v128_bitselect(v1: v128, v2: v128, c: v128) -> v128 { + transmute(llvm_bitselect(v1.as_i8x16(), v2.as_i8x16(), c.as_i8x16())) } /// Lane-wise wrapping absolute value. 
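// Editorial sketch (not part of this patch): the comparison intrinsics return
// a mask with all bits set in each lane where the comparison held, which
// composes directly with `v128_bitselect` to choose lanes from one vector or
// the other.
#[target_feature(enable = "simd128")]
unsafe fn example_compare_and_select() {
    let a = i32x4_splat(1);
    let b = i32x4_splat(2);
    // All mask bits are set because 1 < 2 holds in every lane.
    let mask = i32x4_lt_s(a, b);
    // Take bits from `a` where the mask is 1 and from `b` where it is 0.
    let min = v128_bitselect(a, b, mask);
    assert_eq!(i32x4_extract_lane(min, 0), 1);
}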
#[inline] // #[cfg_attr(test, assert_instr(i8x16.abs))] // FIXME support not in our LLVM yet -pub fn i8x16_abs(a: v128) -> v128 { - unsafe { - let a = transmute::<_, i8x16>(a); - let zero = i8x16::splat(0); - transmute(simd_select::( - simd_lt(a, zero), - simd_sub(zero, a), - a, - )) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_abs(a: v128) -> v128 { + let a = transmute::<_, i8x16>(a); + let zero = i8x16::splat(0); + transmute(simd_select::( + simd_lt(a, zero), + simd_sub(zero, a), + a, + )) } /// Negates a 128-bit vectors intepreted as sixteen 8-bit signed integers #[inline] #[cfg_attr(test, assert_instr(i8x16.neg))] -pub fn i8x16_neg(a: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i8x16(), i8x16::splat(-1))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_neg(a: v128) -> v128 { + transmute(simd_mul(a.as_i8x16(), i8x16::splat(-1))) } /// Returns 1 if any lane is nonzero or 0 if all lanes are zero. #[inline] #[cfg_attr(test, assert_instr(i8x16.any_true))] -pub fn i8x16_any_true(a: v128) -> i32 { - unsafe { llvm_i8x16_any_true(a.as_i8x16()) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_any_true(a: v128) -> i32 { + llvm_i8x16_any_true(a.as_i8x16()) } /// Returns 1 if all lanes are nonzero or 0 if any lane is nonzero. #[inline] #[cfg_attr(test, assert_instr(i8x16.all_true))] -pub fn i8x16_all_true(a: v128) -> i32 { - unsafe { llvm_i8x16_all_true(a.as_i8x16()) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_all_true(a: v128) -> i32 { + llvm_i8x16_all_true(a.as_i8x16()) } // FIXME: not available in our LLVM yet @@ -1245,8 +1338,8 @@ pub fn i8x16_all_true(a: v128) -> i32 { // /// all bits concatenated. // #[inline] // #[cfg_attr(test, assert_instr(i8x16.all_true))] -// pub fn i8x16_bitmask(a: v128) -> i32 { -// unsafe { llvm_bitmask_i8x16(transmute(a)) } +// pub unsafe fn i8x16_bitmask(a: v128) -> i32 { +// llvm_bitmask_i8x16(transmute(a)) // } /// Converts two input vectors into a smaller lane vector by narrowing each @@ -1256,8 +1349,9 @@ pub fn i8x16_all_true(a: v128) -> i32 { /// interpreted as signed integers. #[inline] #[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_s))] -pub fn i8x16_narrow_i16x8_s(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_narrow_i8x16_s(transmute(a), transmute(b))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_narrow_i16x8_s(a: v128, b: v128) -> v128 { + transmute(llvm_narrow_i8x16_s(transmute(a), transmute(b))) } /// Converts two input vectors into a smaller lane vector by narrowing each @@ -1267,8 +1361,9 @@ pub fn i8x16_narrow_i16x8_s(a: v128, b: v128) -> v128 { /// interpreted as signed integers. #[inline] #[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_u))] -pub fn i8x16_narrow_i16x8_u(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_narrow_i8x16_u(transmute(a), transmute(b))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_narrow_i16x8_u(a: v128, b: v128) -> v128 { + transmute(llvm_narrow_i8x16_u(transmute(a), transmute(b))) } /// Shifts each lane to the left by the specified number of bits. @@ -1277,8 +1372,9 @@ pub fn i8x16_narrow_i16x8_u(a: v128, b: v128) -> v128 { /// greater than the lane width. 
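// Editorial sketch (not part of this patch): "wrapping" absolute value means
// the one value with no positive counterpart is left unchanged, so an i8 lane
// holding i8::MIN still holds i8::MIN after `i8x16_abs`. `i8x16_all_true`
// then reports whether every lane is nonzero.
#[target_feature(enable = "simd128")]
unsafe fn example_wrapping_abs() {
    let v = i8x16_abs(i8x16_splat(i8::MIN));
    assert_eq!(i8x16_extract_lane(v, 0), i8::MIN);
    assert_eq!(i8x16_all_true(v), 1);
}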
#[inline] #[cfg_attr(test, assert_instr(i8x16.shl))] -pub fn i8x16_shl(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shl(a.as_i8x16(), i8x16::splat(amt as i8))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_shl(a: v128, amt: u32) -> v128 { + transmute(simd_shl(a.as_i8x16(), i8x16::splat(amt as i8))) } /// Shifts each lane to the right by the specified number of bits, sign @@ -1288,8 +1384,9 @@ pub fn i8x16_shl(a: v128, amt: u32) -> v128 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i8x16.shr_s))] -pub fn i8x16_shr_s(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shr(a.as_i8x16(), i8x16::splat(amt as i8))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_shr_s(a: v128, amt: u32) -> v128 { + transmute(simd_shr(a.as_i8x16(), i8x16::splat(amt as i8))) } /// Shifts each lane to the right by the specified number of bits, shifting in @@ -1299,145 +1396,151 @@ pub fn i8x16_shr_s(a: v128, amt: u32) -> v128 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i8x16.shr_u))] -pub fn i8x16_shr_u(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shr(a.as_u8x16(), u8x16::splat(amt as u8))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_shr_u(a: v128, amt: u32) -> v128 { + transmute(simd_shr(a.as_u8x16(), u8x16::splat(amt as u8))) } /// Adds two 128-bit vectors as if they were two packed sixteen 8-bit integers. #[inline] #[cfg_attr(test, assert_instr(i8x16.add))] -pub fn i8x16_add(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_add(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_add(a: v128, b: v128) -> v128 { + transmute(simd_add(a.as_i8x16(), b.as_i8x16())) } /// Adds two 128-bit vectors as if they were two packed sixteen 8-bit signed /// integers, saturating on overflow to `i8::MAX`. #[inline] #[cfg_attr(test, assert_instr(i8x16.add_saturate_s))] -pub fn i8x16_add_saturate_s(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_i8x16_add_saturate_s(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_add_saturate_s(a: v128, b: v128) -> v128 { + transmute(llvm_i8x16_add_saturate_s(a.as_i8x16(), b.as_i8x16())) } /// Adds two 128-bit vectors as if they were two packed sixteen 8-bit unsigned /// integers, saturating on overflow to `u8::MAX`. #[inline] #[cfg_attr(test, assert_instr(i8x16.add_saturate_u))] -pub fn i8x16_add_saturate_u(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_i8x16_add_saturate_u(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_add_saturate_u(a: v128, b: v128) -> v128 { + transmute(llvm_i8x16_add_saturate_u(a.as_i8x16(), b.as_i8x16())) } /// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit integers. #[inline] #[cfg_attr(test, assert_instr(i8x16.sub))] -pub fn i8x16_sub(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_sub(a: v128, b: v128) -> v128 { + transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } /// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit /// signed integers, saturating on overflow to `i8::MIN`. 
#[inline] #[cfg_attr(test, assert_instr(i8x16.sub_saturate_s))] -pub fn i8x16_sub_saturate_s(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_i8x16_sub_saturate_s(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_sub_saturate_s(a: v128, b: v128) -> v128 { + transmute(llvm_i8x16_sub_saturate_s(a.as_i8x16(), b.as_i8x16())) } /// Subtracts two 128-bit vectors as if they were two packed sixteen 8-bit /// unsigned integers, saturating on overflow to 0. #[inline] #[cfg_attr(test, assert_instr(i8x16.sub_saturate_u))] -pub fn i8x16_sub_saturate_u(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_i8x16_sub_saturate_u(a.as_i8x16(), b.as_i8x16())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_sub_saturate_u(a: v128, b: v128) -> v128 { + transmute(llvm_i8x16_sub_saturate_u(a.as_i8x16(), b.as_i8x16())) } /// Compares lane-wise signed integers, and returns the minimum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i8x16.min_s))] -pub fn i8x16_min_s(a: v128, b: v128) -> v128 { - unsafe { - let a = a.as_i8x16(); - let b = b.as_i8x16(); - transmute(simd_select::(simd_lt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_min_s(a: v128, b: v128) -> v128 { + let a = a.as_i8x16(); + let b = b.as_i8x16(); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compares lane-wise unsigned integers, and returns the minimum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i8x16.min_u))] -pub fn i8x16_min_u(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, u8x16>(a); - let b = transmute::<_, u8x16>(b); - transmute(simd_select::(simd_lt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_min_u(a: v128, b: v128) -> v128 { + let a = transmute::<_, u8x16>(a); + let b = transmute::<_, u8x16>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compares lane-wise signed integers, and returns the maximum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i8x16.max_s))] -pub fn i8x16_max_s(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, i8x16>(a); - let b = transmute::<_, i8x16>(b); - transmute(simd_select::(simd_gt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_max_s(a: v128, b: v128) -> v128 { + let a = transmute::<_, i8x16>(a); + let b = transmute::<_, i8x16>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compares lane-wise unsigned integers, and returns the maximum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i8x16.max_u))] -pub fn i8x16_max_u(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, u8x16>(a); - let b = transmute::<_, u8x16>(b); - transmute(simd_select::(simd_gt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_max_u(a: v128, b: v128) -> v128 { + let a = transmute::<_, u8x16>(a); + let b = transmute::<_, u8x16>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Lane-wise rounding average. #[inline] #[cfg_attr(test, assert_instr(i8x16.avgr_u))] -pub fn i8x16_avgr_u(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_avgr_u_i8x16(transmute(a), transmute(b))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i8x16_avgr_u(a: v128, b: v128) -> v128 { + transmute(llvm_avgr_u_i8x16(transmute(a), transmute(b))) } /// Lane-wise wrapping absolute value. 
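// Editorial sketch (not part of this patch): the `*_saturate_*` additions and
// subtractions clamp to the lane's value range instead of wrapping on
// overflow.
#[target_feature(enable = "simd128")]
unsafe fn example_saturating_add() {
    let a = i8x16_splat(100);
    // Plain addition wraps: 100 + 100 = 200, which wraps to -56 in an i8 lane.
    assert_eq!(i8x16_extract_lane(i8x16_add(a, a), 0), -56);
    // The signed saturating form clamps to i8::MAX (127) instead.
    assert_eq!(i8x16_extract_lane(i8x16_add_saturate_s(a, a), 0), i8::MAX);
}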
#[inline] // #[cfg_attr(test, assert_instr(i16x8.abs))] // FIXME support not in our LLVM yet -pub fn i16x8_abs(a: v128) -> v128 { - unsafe { - let a = transmute::<_, i16x8>(a); - let zero = i16x8::splat(0); - transmute(simd_select::( - simd_lt(a, zero), - simd_sub(zero, a), - a, - )) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_abs(a: v128) -> v128 { + let a = transmute::<_, i16x8>(a); + let zero = i16x8::splat(0); + transmute(simd_select::( + simd_lt(a, zero), + simd_sub(zero, a), + a, + )) } /// Negates a 128-bit vectors intepreted as eight 16-bit signed integers #[inline] #[cfg_attr(test, assert_instr(i16x8.neg))] -pub fn i16x8_neg(a: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i16x8(), i16x8::splat(-1))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_neg(a: v128) -> v128 { + transmute(simd_mul(a.as_i16x8(), i16x8::splat(-1))) } /// Returns 1 if any lane is nonzero or 0 if all lanes are zero. #[inline] #[cfg_attr(test, assert_instr(i16x8.any_true))] -pub fn i16x8_any_true(a: v128) -> i32 { - unsafe { llvm_i16x8_any_true(a.as_i16x8()) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_any_true(a: v128) -> i32 { + llvm_i16x8_any_true(a.as_i16x8()) } /// Returns 1 if all lanes are nonzero or 0 if any lane is nonzero. #[inline] #[cfg_attr(test, assert_instr(i16x8.all_true))] -pub fn i16x8_all_true(a: v128) -> i32 { - unsafe { llvm_i16x8_all_true(a.as_i16x8()) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_all_true(a: v128) -> i32 { + llvm_i16x8_all_true(a.as_i16x8()) } // FIXME: not available in our LLVM yet @@ -1445,8 +1548,8 @@ pub fn i16x8_all_true(a: v128) -> i32 { // /// all bits concatenated. // #[inline] // #[cfg_attr(test, assert_instr(i16x8.all_true))] -// pub fn i16x8_bitmask(a: v128) -> i32 { -// unsafe { llvm_bitmask_i16x8(transmute(a)) } +// pub unsafe fn i16x8_bitmask(a: v128) -> i32 { +// llvm_bitmask_i16x8(transmute(a)) // } /// Converts two input vectors into a smaller lane vector by narrowing each @@ -1456,8 +1559,9 @@ pub fn i16x8_all_true(a: v128) -> i32 { /// interpreted as signed integers. #[inline] #[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_s))] -pub fn i16x8_narrow_i32x4_s(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_narrow_i16x8_s(transmute(a), transmute(b))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_narrow_i32x4_s(a: v128, b: v128) -> v128 { + transmute(llvm_narrow_i16x8_s(transmute(a), transmute(b))) } /// Converts two input vectors into a smaller lane vector by narrowing each @@ -1467,8 +1571,9 @@ pub fn i16x8_narrow_i32x4_s(a: v128, b: v128) -> v128 { /// interpreted as signed integers. #[inline] #[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_u))] -pub fn i16x8_narrow_i32x4_u(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_narrow_i16x8_u(transmute(a), transmute(b))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_narrow_i32x4_u(a: v128, b: v128) -> v128 { + transmute(llvm_narrow_i16x8_u(transmute(a), transmute(b))) } // FIXME waiting on a runtime implementation to test @@ -1476,32 +1581,32 @@ pub fn i16x8_narrow_i32x4_u(a: v128, b: v128) -> v128 { // /// vector, sign extended. 
// #[inline] // #[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_s))] -// pub fn i16x8_widen_low_i8x16_s(a: v128) -> v128 { -// unsafe { transmute(llvm_widen_low_i16x8_s(transmute(a))) } +// pub unsafe fn i16x8_widen_low_i8x16_s(a: v128) -> v128 { +// transmute(llvm_widen_low_i16x8_s(transmute(a))) // } // /// Converts high half of the smaller lane vector to a larger lane // /// vector, sign extended. // #[inline] // #[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_s))] -// pub fn i16x8_widen_high_i8x16_s(a: v128) -> v128 { -// unsafe { transmute(llvm_widen_high_i16x8_s(transmute(a))) } +// pub unsafe fn i16x8_widen_high_i8x16_s(a: v128) -> v128 { +// transmute(llvm_widen_high_i16x8_s(transmute(a))) // } // /// Converts low half of the smaller lane vector to a larger lane // /// vector, zero extended. // #[inline] // #[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_u))] -// pub fn i16x8_widen_low_i8x16_u(a: v128) -> v128 { -// unsafe { transmute(llvm_widen_low_i16x8_u(transmute(a))) } +// pub unsafe fn i16x8_widen_low_i8x16_u(a: v128) -> v128 { +// transmute(llvm_widen_low_i16x8_u(transmute(a))) // } // /// Converts high half of the smaller lane vector to a larger lane // /// vector, zero extended. // #[inline] // #[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_u))] -// pub fn i16x8_widen_high_i8x16_u(a: v128) -> v128 { -// unsafe { transmute(llvm_widen_high_i16x8_u(transmute(a))) } +// pub unsafe fn i16x8_widen_high_i8x16_u(a: v128) -> v128 { +// transmute(llvm_widen_high_i16x8_u(transmute(a))) // } /// Shifts each lane to the left by the specified number of bits. @@ -1510,8 +1615,9 @@ pub fn i16x8_narrow_i32x4_u(a: v128, b: v128) -> v128 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i16x8.shl))] -pub fn i16x8_shl(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shl(a.as_i16x8(), i16x8::splat(amt as i16))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_shl(a: v128, amt: u32) -> v128 { + transmute(simd_shl(a.as_i16x8(), i16x8::splat(amt as i16))) } /// Shifts each lane to the right by the specified number of bits, sign @@ -1521,8 +1627,9 @@ pub fn i16x8_shl(a: v128, amt: u32) -> v128 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i16x8.shr_s))] -pub fn i16x8_shr_s(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(amt as i16))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_shr_s(a: v128, amt: u32) -> v128 { + transmute(simd_shr(a.as_i16x8(), i16x8::splat(amt as i16))) } /// Shifts each lane to the right by the specified number of bits, shifting in @@ -1532,153 +1639,160 @@ pub fn i16x8_shr_s(a: v128, amt: u32) -> v128 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i16x8.shr_u))] -pub fn i16x8_shr_u(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shr(a.as_u16x8(), u16x8::splat(amt as u16))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_shr_u(a: v128, amt: u32) -> v128 { + transmute(simd_shr(a.as_u16x8(), u16x8::splat(amt as u16))) } /// Adds two 128-bit vectors as if they were two packed eight 16-bit integers. 
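// Editorial sketch (not part of this patch): the narrowing conversions above
// saturate each wider lane into the smaller lane type, so out-of-range values
// are clamped rather than truncated.
#[target_feature(enable = "simd128")]
unsafe fn example_narrow() {
    let wide = i32x4_splat(100_000);
    // 100_000 does not fit in an i16, so every lane saturates to i16::MAX.
    let narrow = i16x8_narrow_i32x4_s(wide, wide);
    assert_eq!(i16x8_extract_lane(narrow, 0), i16::MAX);
}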
#[inline] #[cfg_attr(test, assert_instr(i16x8.add))] -pub fn i16x8_add(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_add(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_add(a: v128, b: v128) -> v128 { + transmute(simd_add(a.as_i16x8(), b.as_i16x8())) } /// Adds two 128-bit vectors as if they were two packed eight 16-bit signed /// integers, saturating on overflow to `i16::MAX`. #[inline] #[cfg_attr(test, assert_instr(i16x8.add_saturate_s))] -pub fn i16x8_add_saturate_s(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_i16x8_add_saturate_s(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_add_saturate_s(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_add_saturate_s(a.as_i16x8(), b.as_i16x8())) } /// Adds two 128-bit vectors as if they were two packed eight 16-bit unsigned /// integers, saturating on overflow to `u16::MAX`. #[inline] #[cfg_attr(test, assert_instr(i16x8.add_saturate_u))] -pub fn i16x8_add_saturate_u(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_i16x8_add_saturate_u(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_add_saturate_u(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_add_saturate_u(a.as_i16x8(), b.as_i16x8())) } /// Subtracts two 128-bit vectors as if they were two packed eight 16-bit integers. #[inline] #[cfg_attr(test, assert_instr(i16x8.sub))] -pub fn i16x8_sub(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_sub(a: v128, b: v128) -> v128 { + transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } /// Subtracts two 128-bit vectors as if they were two packed eight 16-bit /// signed integers, saturating on overflow to `i16::MIN`. #[inline] #[cfg_attr(test, assert_instr(i16x8.sub_saturate_s))] -pub fn i16x8_sub_saturate_s(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_i16x8_sub_saturate_s(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_sub_saturate_s(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_sub_saturate_s(a.as_i16x8(), b.as_i16x8())) } /// Subtracts two 128-bit vectors as if they were two packed eight 16-bit /// unsigned integers, saturating on overflow to 0. #[inline] #[cfg_attr(test, assert_instr(i16x8.sub_saturate_u))] -pub fn i16x8_sub_saturate_u(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_i16x8_sub_saturate_u(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_sub_saturate_u(a: v128, b: v128) -> v128 { + transmute(llvm_i16x8_sub_saturate_u(a.as_i16x8(), b.as_i16x8())) } /// Multiplies two 128-bit vectors as if they were two packed eight 16-bit /// signed integers. #[inline] #[cfg_attr(test, assert_instr(i16x8.mul))] -pub fn i16x8_mul(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_mul(a: v128, b: v128) -> v128 { + transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } /// Compares lane-wise signed integers, and returns the minimum of /// each pair. 
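As a quick sketch of the wrapping versus saturating behavior documented above (again assuming `i16x8_splat` from this module and an illustrative helper name):

    use core_arch::arch::wasm32::*;

    #[target_feature(enable = "simd128")]
    unsafe fn saturate_demo() -> (i16, i16) {
        let max = i16x8_splat(i16::MAX);
        let one = i16x8_splat(1);
        // the plain add wraps each lane around to i16::MIN ...
        let wrapped = core::mem::transmute::<_, [i16; 8]>(i16x8_add(max, one))[0];
        // ... while the saturating add clamps each lane at i16::MAX
        let clamped = core::mem::transmute::<_, [i16; 8]>(i16x8_add_saturate_s(max, one))[0];
        (wrapped, clamped) // (-32768, 32767)
    }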
#[inline] #[cfg_attr(test, assert_instr(i16x8.min_s))] -pub fn i16x8_min_s(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, i16x8>(a); - let b = transmute::<_, i16x8>(b); - transmute(simd_select::(simd_lt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_min_s(a: v128, b: v128) -> v128 { + let a = transmute::<_, i16x8>(a); + let b = transmute::<_, i16x8>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compares lane-wise unsigned integers, and returns the minimum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i16x8.min_u))] -pub fn i16x8_min_u(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, u16x8>(a); - let b = transmute::<_, u16x8>(b); - transmute(simd_select::(simd_lt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_min_u(a: v128, b: v128) -> v128 { + let a = transmute::<_, u16x8>(a); + let b = transmute::<_, u16x8>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compares lane-wise signed integers, and returns the maximum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i16x8.max_s))] -pub fn i16x8_max_s(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, i16x8>(a); - let b = transmute::<_, i16x8>(b); - transmute(simd_select::(simd_gt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_max_s(a: v128, b: v128) -> v128 { + let a = transmute::<_, i16x8>(a); + let b = transmute::<_, i16x8>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compares lane-wise unsigned integers, and returns the maximum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i16x8.max_u))] -pub fn i16x8_max_u(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, u16x8>(a); - let b = transmute::<_, u16x8>(b); - transmute(simd_select::(simd_gt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_max_u(a: v128, b: v128) -> v128 { + let a = transmute::<_, u16x8>(a); + let b = transmute::<_, u16x8>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Lane-wise rounding average. #[inline] #[cfg_attr(test, assert_instr(i16x8.avgr_u))] -pub fn i16x8_avgr_u(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_avgr_u_i16x8(transmute(a), transmute(b))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i16x8_avgr_u(a: v128, b: v128) -> v128 { + transmute(llvm_avgr_u_i16x8(transmute(a), transmute(b))) } /// Lane-wise wrapping absolute value. #[inline] // #[cfg_attr(test, assert_instr(i32x4.abs))] // FIXME support not in our LLVM yet -pub fn i32x4_abs(a: v128) -> v128 { - unsafe { - let a = transmute::<_, i32x4>(a); - let zero = i32x4::splat(0); - transmute(simd_select::( - simd_lt(a, zero), - simd_sub(zero, a), - a, - )) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_abs(a: v128) -> v128 { + let a = transmute::<_, i32x4>(a); + let zero = i32x4::splat(0); + transmute(simd_select::( + simd_lt(a, zero), + simd_sub(zero, a), + a, + )) } /// Negates a 128-bit vectors intepreted as four 32-bit signed integers #[inline] #[cfg_attr(test, assert_instr(i32x4.neg))] -pub fn i32x4_neg(a: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i32x4(), i32x4::splat(-1))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_neg(a: v128) -> v128 { + transmute(simd_mul(a.as_i32x4(), i32x4::splat(-1))) } /// Returns 1 if any lane is nonzero or 0 if all lanes are zero. 
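The signed/unsigned split in the min/max family matters because the same bit pattern can compare in opposite orders. A sketch, assuming `i16x8_splat` and an illustrative helper name:

    use core_arch::arch::wasm32::*;

    #[target_feature(enable = "simd128")]
    unsafe fn min_demo() -> (i16, u16) {
        let a = i16x8_splat(-1); // 0xffff in every lane
        let b = i16x8_splat(0);
        // signed compare: -1 < 0, so min_s picks -1
        let s = core::mem::transmute::<_, [i16; 8]>(i16x8_min_s(a, b))[0];
        // unsigned compare: 0xffff is the largest u16, so min_u picks 0
        let u = core::mem::transmute::<_, [u16; 8]>(i16x8_min_u(a, b))[0];
        (s, u)
    }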
#[inline] #[cfg_attr(test, assert_instr(i32x4.any_true))] -pub fn i32x4_any_true(a: v128) -> i32 { - unsafe { llvm_i32x4_any_true(a.as_i32x4()) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_any_true(a: v128) -> i32 { + llvm_i32x4_any_true(a.as_i32x4()) } /// Returns 1 if all lanes are nonzero or 0 if any lane is nonzero. #[inline] #[cfg_attr(test, assert_instr(i32x4.all_true))] -pub fn i32x4_all_true(a: v128) -> i32 { - unsafe { llvm_i32x4_all_true(a.as_i32x4()) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_all_true(a: v128) -> i32 { + llvm_i32x4_all_true(a.as_i32x4()) } // FIXME: not available in our LLVM yet @@ -1686,8 +1800,8 @@ pub fn i32x4_all_true(a: v128) -> i32 { // /// all bits concatenated. // #[inline] // #[cfg_attr(test, assert_instr(i32x4.all_true))] -// pub fn i32x4_bitmask(a: v128) -> i32 { -// unsafe { llvm_bitmask_i32x4(transmute(a)) } +// pub unsafe fn i32x4_bitmask(a: v128) -> i32 { +// llvm_bitmask_i32x4(transmute(a)) // } // FIXME waiting on a runtime implementation to test @@ -1695,32 +1809,32 @@ pub fn i32x4_all_true(a: v128) -> i32 { // /// vector, sign extended. // #[inline] // #[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_s))] -// pub fn i32x4_widen_low_i16x8_s(a: v128) -> v128 { -// unsafe { transmute(llvm_widen_low_i32x4_s(transmute(a))) } +// pub unsafe fn i32x4_widen_low_i16x8_s(a: v128) -> v128 { +// transmute(llvm_widen_low_i32x4_s(transmute(a))) // } // /// Converts high half of the smaller lane vector to a larger lane // /// vector, sign extended. // #[inline] // #[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_s))] -// pub fn i32x4_widen_high_i16x8_s(a: v128) -> v128 { -// unsafe { transmute(llvm_widen_high_i32x4_s(transmute(a))) } +// pub unsafe fn i32x4_widen_high_i16x8_s(a: v128) -> v128 { +// transmute(llvm_widen_high_i32x4_s(transmute(a))) // } // /// Converts low half of the smaller lane vector to a larger lane // /// vector, zero extended. // #[inline] // #[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_u))] -// pub fn i32x4_widen_low_i16x8_u(a: v128) -> v128 { -// unsafe { transmute(llvm_widen_low_i32x4_u(transmute(a))) } +// pub unsafe fn i32x4_widen_low_i16x8_u(a: v128) -> v128 { +// transmute(llvm_widen_low_i32x4_u(transmute(a))) // } // /// Converts high half of the smaller lane vector to a larger lane // /// vector, zero extended. // #[inline] // #[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_u))] -// pub fn i32x4_widen_high_i16x8_u(a: v128) -> v128 { -// unsafe { transmute(llvm_widen_high_i32x4_u(transmute(a))) } +// pub unsafe fn i32x4_widen_high_i16x8_u(a: v128) -> v128 { +// transmute(llvm_widen_high_i32x4_u(transmute(a))) // } /// Shifts each lane to the left by the specified number of bits. @@ -1729,8 +1843,9 @@ pub fn i32x4_all_true(a: v128) -> i32 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i32x4.shl))] -pub fn i32x4_shl(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shl(a.as_i32x4(), i32x4::splat(amt as i32))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_shl(a: v128, amt: u32) -> v128 { + transmute(simd_shl(a.as_i32x4(), i32x4::splat(amt as i32))) } /// Shifts each lane to the right by the specified number of bits, sign @@ -1740,8 +1855,9 @@ pub fn i32x4_shl(a: v128, amt: u32) -> v128 { /// greater than the lane width. 
#[inline] #[cfg_attr(test, assert_instr(i32x4.shr_s))] -pub fn i32x4_shr_s(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(amt as i32))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_shr_s(a: v128, amt: u32) -> v128 { + transmute(simd_shr(a.as_i32x4(), i32x4::splat(amt as i32))) } /// Shifts each lane to the right by the specified number of bits, shifting in @@ -1751,85 +1867,86 @@ pub fn i32x4_shr_s(a: v128, amt: u32) -> v128 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i32x4.shr_u))] -pub fn i32x4_shr_u(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shr(a.as_u32x4(), u32x4::splat(amt as u32))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_shr_u(a: v128, amt: u32) -> v128 { + transmute(simd_shr(a.as_u32x4(), u32x4::splat(amt as u32))) } /// Adds two 128-bit vectors as if they were two packed four 32-bit integers. #[inline] #[cfg_attr(test, assert_instr(i32x4.add))] -pub fn i32x4_add(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_add(a.as_i32x4(), b.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_add(a: v128, b: v128) -> v128 { + transmute(simd_add(a.as_i32x4(), b.as_i32x4())) } /// Subtracts two 128-bit vectors as if they were two packed four 32-bit integers. #[inline] #[cfg_attr(test, assert_instr(i32x4.sub))] -pub fn i32x4_sub(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_sub(a: v128, b: v128) -> v128 { + transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) } /// Multiplies two 128-bit vectors as if they were two packed four 32-bit /// signed integers. #[inline] #[cfg_attr(test, assert_instr(i32x4.mul))] -pub fn i32x4_mul(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_mul(a: v128, b: v128) -> v128 { + transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) } /// Compares lane-wise signed integers, and returns the minimum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i32x4.min_s))] -pub fn i32x4_min_s(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, i32x4>(a); - let b = transmute::<_, i32x4>(b); - transmute(simd_select::(simd_lt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_min_s(a: v128, b: v128) -> v128 { + let a = transmute::<_, i32x4>(a); + let b = transmute::<_, i32x4>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compares lane-wise unsigned integers, and returns the minimum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i32x4.min_u))] -pub fn i32x4_min_u(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, u32x4>(a); - let b = transmute::<_, u32x4>(b); - transmute(simd_select::(simd_lt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_min_u(a: v128, b: v128) -> v128 { + let a = transmute::<_, u32x4>(a); + let b = transmute::<_, u32x4>(b); + transmute(simd_select::(simd_lt(a, b), a, b)) } /// Compares lane-wise signed integers, and returns the maximum of /// each pair. 
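These integer lane operations compose in the usual way; for example, a 4-wide multiply-accumulate step is just a combination of the intrinsics defined above (the helper name here is illustrative):

    use core_arch::arch::wasm32::*;

    // acc[i] + a[i] * b[i] for each of the four lanes (wrapping on overflow).
    #[target_feature(enable = "simd128")]
    unsafe fn mul_add(acc: v128, a: v128, b: v128) -> v128 {
        i32x4_add(acc, i32x4_mul(a, b))
    }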
#[inline] #[cfg_attr(test, assert_instr(i32x4.max_s))] -pub fn i32x4_max_s(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, i32x4>(a); - let b = transmute::<_, i32x4>(b); - transmute(simd_select::(simd_gt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_max_s(a: v128, b: v128) -> v128 { + let a = transmute::<_, i32x4>(a); + let b = transmute::<_, i32x4>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Compares lane-wise unsigned integers, and returns the maximum of /// each pair. #[inline] #[cfg_attr(test, assert_instr(i32x4.max_u))] -pub fn i32x4_max_u(a: v128, b: v128) -> v128 { - unsafe { - let a = transmute::<_, u32x4>(a); - let b = transmute::<_, u32x4>(b); - transmute(simd_select::(simd_gt(a, b), a, b)) - } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_max_u(a: v128, b: v128) -> v128 { + let a = transmute::<_, u32x4>(a); + let b = transmute::<_, u32x4>(b); + transmute(simd_select::(simd_gt(a, b), a, b)) } /// Negates a 128-bit vectors intepreted as two 64-bit signed integers #[inline] #[cfg_attr(test, assert_instr(i64x2.neg))] -pub fn i64x2_neg(a: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i64x2(), i64x2::splat(-1))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_neg(a: v128) -> v128 { + transmute(simd_mul(a.as_i64x2(), i64x2::splat(-1))) } /// Shifts each lane to the left by the specified number of bits. @@ -1838,8 +1955,9 @@ pub fn i64x2_neg(a: v128) -> v128 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i64x2.shl))] -pub fn i64x2_shl(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shl(a.as_i64x2(), i64x2::splat(amt as i64))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_shl(a: v128, amt: u32) -> v128 { + transmute(simd_shl(a.as_i64x2(), i64x2::splat(amt as i64))) } /// Shifts each lane to the right by the specified number of bits, sign @@ -1849,8 +1967,9 @@ pub fn i64x2_shl(a: v128, amt: u32) -> v128 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i64x2.shr_s))] -pub fn i64x2_shr_s(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shr(a.as_i64x2(), i64x2::splat(amt as i64))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_shr_s(a: v128, amt: u32) -> v128 { + transmute(simd_shr(a.as_i64x2(), i64x2::splat(amt as i64))) } /// Shifts each lane to the right by the specified number of bits, shifting in @@ -1860,173 +1979,195 @@ pub fn i64x2_shr_s(a: v128, amt: u32) -> v128 { /// greater than the lane width. #[inline] #[cfg_attr(test, assert_instr(i64x2.shr_u))] -pub fn i64x2_shr_u(a: v128, amt: u32) -> v128 { - unsafe { transmute(simd_shr(a.as_u64x2(), u64x2::splat(amt as u64))) } +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_shr_u(a: v128, amt: u32) -> v128 { + transmute(simd_shr(a.as_u64x2(), u64x2::splat(amt as u64))) } /// Adds two 128-bit vectors as if they were two packed two 64-bit integers. #[inline] #[cfg_attr(test, assert_instr(i64x2.add))] -pub fn i64x2_add(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_add(a: v128, b: v128) -> v128 { + transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } /// Subtracts two 128-bit vectors as if they were two packed two 64-bit integers. 
#[inline] #[cfg_attr(test, assert_instr(i64x2.sub))] -pub fn i64x2_sub(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_sub(a: v128, b: v128) -> v128 { + transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } /// Multiplies two 128-bit vectors as if they were two packed two 64-bit integers. #[inline] // #[cfg_attr(test, assert_instr(i64x2.mul))] // FIXME: not present in our LLVM -pub fn i64x2_mul(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i64x2(), b.as_i64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i64x2_mul(a: v128, b: v128) -> v128 { + transmute(simd_mul(a.as_i64x2(), b.as_i64x2())) } /// Calculates the absolute value of each lane of a 128-bit vector interpreted /// as four 32-bit floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f32x4.abs))] -pub fn f32x4_abs(a: v128) -> v128 { - unsafe { transmute(llvm_f32x4_abs(a.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_abs(a: v128) -> v128 { + transmute(llvm_f32x4_abs(a.as_f32x4())) } /// Negates each lane of a 128-bit vector interpreted as four 32-bit floating /// point numbers. #[inline] #[cfg_attr(test, assert_instr(f32x4.neg))] -pub fn f32x4_neg(a: v128) -> v128 { - unsafe { f32x4_mul(a, transmute(f32x4(-1.0, -1.0, -1.0, -1.0))) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_neg(a: v128) -> v128 { + f32x4_mul(a, transmute(f32x4(-1.0, -1.0, -1.0, -1.0))) } /// Calculates the square root of each lane of a 128-bit vector interpreted as /// four 32-bit floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f32x4.sqrt))] -pub fn f32x4_sqrt(a: v128) -> v128 { - unsafe { transmute(llvm_f32x4_sqrt(a.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_sqrt(a: v128) -> v128 { + transmute(llvm_f32x4_sqrt(a.as_f32x4())) } /// Adds pairwise lanes of two 128-bit vectors interpreted as four 32-bit /// floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f32x4.add))] -pub fn f32x4_add(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_add(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_add(a: v128, b: v128) -> v128 { + transmute(simd_add(a.as_f32x4(), b.as_f32x4())) } /// Subtracts pairwise lanes of two 128-bit vectors interpreted as four 32-bit /// floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f32x4.sub))] -pub fn f32x4_sub(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_sub(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_sub(a: v128, b: v128) -> v128 { + transmute(simd_sub(a.as_f32x4(), b.as_f32x4())) } /// Multiplies pairwise lanes of two 128-bit vectors interpreted as four 32-bit /// floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f32x4.mul))] -pub fn f32x4_mul(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_mul(a: v128, b: v128) -> v128 { + transmute(simd_mul(a.as_f32x4(), b.as_f32x4())) } /// Divides pairwise lanes of two 128-bit vectors interpreted as four 32-bit /// floating point numbers. 
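The floating-point lanes compose the same way; as a sketch, a 4-wide `sqrt(x*x + y*y)` built only from the intrinsics above (helper name illustrative):

    use core_arch::arch::wasm32::*;

    #[target_feature(enable = "simd128")]
    unsafe fn hypot4(x: v128, y: v128) -> v128 {
        f32x4_sqrt(f32x4_add(f32x4_mul(x, x), f32x4_mul(y, y)))
    }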
#[inline] #[cfg_attr(test, assert_instr(f32x4.div))] -pub fn f32x4_div(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_div(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_div(a: v128, b: v128) -> v128 { + transmute(simd_div(a.as_f32x4(), b.as_f32x4())) } /// Calculates the minimum of pairwise lanes of two 128-bit vectors interpreted /// as four 32-bit floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f32x4.min))] -pub fn f32x4_min(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_f32x4_min(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_min(a: v128, b: v128) -> v128 { + transmute(llvm_f32x4_min(a.as_f32x4(), b.as_f32x4())) } /// Calculates the maximum of pairwise lanes of two 128-bit vectors interpreted /// as four 32-bit floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f32x4.max))] -pub fn f32x4_max(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_f32x4_max(a.as_f32x4(), b.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_max(a: v128, b: v128) -> v128 { + transmute(llvm_f32x4_max(a.as_f32x4(), b.as_f32x4())) } /// Calculates the absolute value of each lane of a 128-bit vector interpreted /// as two 64-bit floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f64x2.abs))] -pub fn f64x2_abs(a: v128) -> v128 { - unsafe { transmute(llvm_f64x2_abs(a.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_abs(a: v128) -> v128 { + transmute(llvm_f64x2_abs(a.as_f64x2())) } /// Negates each lane of a 128-bit vector interpreted as two 64-bit floating /// point numbers. #[inline] #[cfg_attr(test, assert_instr(f64x2.neg))] -pub fn f64x2_neg(a: v128) -> v128 { - unsafe { f64x2_mul(a, transmute(f64x2(-1.0, -1.0))) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_neg(a: v128) -> v128 { + f64x2_mul(a, transmute(f64x2(-1.0, -1.0))) } /// Calculates the square root of each lane of a 128-bit vector interpreted as /// two 64-bit floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f64x2.sqrt))] -pub fn f64x2_sqrt(a: v128) -> v128 { - unsafe { transmute(llvm_f64x2_sqrt(a.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_sqrt(a: v128) -> v128 { + transmute(llvm_f64x2_sqrt(a.as_f64x2())) } /// Adds pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f64x2.add))] -pub fn f64x2_add(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_add(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_add(a: v128, b: v128) -> v128 { + transmute(simd_add(a.as_f64x2(), b.as_f64x2())) } /// Subtracts pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f64x2.sub))] -pub fn f64x2_sub(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_sub(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_sub(a: v128, b: v128) -> v128 { + transmute(simd_sub(a.as_f64x2(), b.as_f64x2())) } /// Multiplies pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. 
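One common use of the pairwise min/max above is a per-lane clamp. Note that, per the wasm semantics of `f32x4.min`/`f32x4.max`, a NaN lane propagates through rather than being clamped, so this sketch (with an illustrative helper name) only behaves as a clamp for non-NaN inputs:

    use core_arch::arch::wasm32::*;

    // Clamp every f32 lane of `v` into [lo, hi] (for non-NaN inputs).
    #[target_feature(enable = "simd128")]
    unsafe fn clamp4(v: v128, lo: v128, hi: v128) -> v128 {
        f32x4_min(f32x4_max(v, lo), hi)
    }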
#[inline] #[cfg_attr(test, assert_instr(f64x2.mul))] -pub fn f64x2_mul(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_mul(a: v128, b: v128) -> v128 { + transmute(simd_mul(a.as_f64x2(), b.as_f64x2())) } /// Divides pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f64x2.div))] -pub fn f64x2_div(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_div(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_div(a: v128, b: v128) -> v128 { + transmute(simd_div(a.as_f64x2(), b.as_f64x2())) } /// Calculates the minimum of pairwise lanes of two 128-bit vectors interpreted /// as two 64-bit floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f64x2.min))] -pub fn f64x2_min(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_f64x2_min(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_min(a: v128, b: v128) -> v128 { + transmute(llvm_f64x2_min(a.as_f64x2(), b.as_f64x2())) } /// Calculates the maximum of pairwise lanes of two 128-bit vectors interpreted /// as two 64-bit floating point numbers. #[inline] #[cfg_attr(test, assert_instr(f64x2.max))] -pub fn f64x2_max(a: v128, b: v128) -> v128 { - unsafe { transmute(llvm_f64x2_max(a.as_f64x2(), b.as_f64x2())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f64x2_max(a: v128, b: v128) -> v128 { + transmute(llvm_f64x2_max(a.as_f64x2(), b.as_f64x2())) } /// Converts a 128-bit vector interpreted as four 32-bit floating point numbers @@ -2036,8 +2177,9 @@ pub fn f64x2_max(a: v128, b: v128) -> v128 { /// representable intger. #[inline] #[cfg_attr(test, assert_instr("i32x4.trunc_sat_f32x4_s"))] -pub fn i32x4_trunc_sat_f32x4_s(a: v128) -> v128 { - unsafe { transmute(simd_cast::<_, i32x4>(a.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_trunc_sat_f32x4_s(a: v128) -> v128 { + transmute(simd_cast::<_, i32x4>(a.as_f32x4())) } /// Converts a 128-bit vector interpreted as four 32-bit floating point numbers @@ -2047,24 +2189,27 @@ pub fn i32x4_trunc_sat_f32x4_s(a: v128) -> v128 { /// representable intger. #[inline] #[cfg_attr(test, assert_instr("i32x4.trunc_sat_f32x4_u"))] -pub fn i32x4_trunc_sat_f32x4_su(a: v128) -> v128 { - unsafe { transmute(simd_cast::<_, u32x4>(a.as_f32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn i32x4_trunc_sat_f32x4_su(a: v128) -> v128 { + transmute(simd_cast::<_, u32x4>(a.as_f32x4())) } /// Converts a 128-bit vector interpreted as four 32-bit signed integers into a /// 128-bit vector of four 32-bit floating point numbers. #[inline] #[cfg_attr(test, assert_instr("f32x4.convert_i32x4_s"))] -pub fn f32x4_convert_i32x4_s(a: v128) -> v128 { - unsafe { transmute(simd_cast::<_, f32x4>(a.as_i32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_convert_i32x4_s(a: v128) -> v128 { + transmute(simd_cast::<_, f32x4>(a.as_i32x4())) } /// Converts a 128-bit vector interpreted as four 32-bit unsigned integers into a /// 128-bit vector of four 32-bit floating point numbers. 
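The "sat" in these conversions is the interesting part: per the `i32x4.trunc_sat_f32x4_s` instruction's semantics, out-of-range lanes clamp to the nearest representable integer and NaN lanes become zero instead of trapping. A sketch, assuming the `f32x4_splat` intrinsic defined elsewhere in this module:

    use core_arch::arch::wasm32::*;

    #[target_feature(enable = "simd128")]
    unsafe fn trunc_demo() -> [i32; 4] {
        // 3e10 is far outside the i32 range, so every lane clamps to i32::MAX.
        core::mem::transmute(i32x4_trunc_sat_f32x4_s(f32x4_splat(3.0e10)))
    }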
#[inline] #[cfg_attr(test, assert_instr("f32x4.convert_i32x4_u"))] -pub fn f32x4_convert_i32x4_u(a: v128) -> v128 { - unsafe { transmute(simd_cast::<_, f32x4>(a.as_u32x4())) } +#[target_feature(enable = "simd128")] +pub unsafe fn f32x4_convert_i32x4_u(a: v128) -> v128 { + transmute(simd_cast::<_, f32x4>(a.as_u32x4())) } #[cfg(test)] @@ -2083,7 +2228,8 @@ pub mod tests { #[test] fn test_v128_const() { - const A: v128 = super::v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + const A: v128 = + unsafe { super::v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) }; compare_bytes(A, A); } @@ -2091,11 +2237,11 @@ pub mod tests { ($test_id:ident: $val:expr => $($vals:expr),*) => { #[test] fn $test_id() { + unsafe { let a = super::$test_id($val); - let b: v128 = unsafe { - transmute([$($vals as u8),*]) - }; + let b: v128 = transmute([$($vals as u8),*]); compare_bytes(a, b); + } } } } diff --git a/crates/core_arch/tests/xcrate-macros.rs b/crates/core_arch/tests/xcrate-macros.rs index 1b32a6c70d..9ab0ea83ae 100644 --- a/crates/core_arch/tests/xcrate-macros.rs +++ b/crates/core_arch/tests/xcrate-macros.rs @@ -1,18 +1,20 @@ #![feature(stdsimd)] #[test] -#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] +#[cfg(target_arch = "wasm32")] fn wut() { - use core_arch::arch::wasm32; - let a = wasm32::v128_const(0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = wasm32::v128_const( - 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ); + unsafe { + use core_arch::arch::wasm32; + let a = wasm32::v128_const(0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = wasm32::v128_const( + 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); - let vec_r = unsafe { - wasm32::v8x16_shuffle!(a, b, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30,) - }; + let vec_r = wasm32::v8x16_shuffle!( + a, b, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30, + ); - let e = wasm32::v128_const(0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30); - assert_eq!(wasm32::i8x16_all_true(wasm32::i8x16_eq(e, vec_r)), 1); + let e = wasm32::v128_const(0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30); + assert_eq!(wasm32::i8x16_all_true(wasm32::i8x16_eq(e, vec_r)), 1); + } } diff --git a/crates/stdarch-test/Cargo.toml b/crates/stdarch-test/Cargo.toml index 2fc42db92a..9eb6b64d16 100644 --- a/crates/stdarch-test/Cargo.toml +++ b/crates/stdarch-test/Cargo.toml @@ -11,8 +11,13 @@ lazy_static = "1.0" rustc-demangle = "0.1.8" cfg-if = "0.1" +# We use a crates.io dependency to disassemble wasm binaries to look for +# instructions for `#[assert_instr]`. Note that we use an `=` dependency here +# instead of a floating dependency because the text format for wasm changes over +# time, and we want to make updates to this explicit rather than automatically +# picking up updates which might break CI with new instruction names. [target.'cfg(target_arch = "wasm32")'.dependencies] -wasmprinter = "0.2.6" +wasmprinter = "=0.2.6" [features] default = [] diff --git a/examples/hex.rs b/examples/hex.rs index d9818d03e5..10b548391f 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -12,7 +12,7 @@ //! //! and you should see `746573740a` get printed out. 
-#![feature(stdsimd)] +#![feature(stdsimd, wasm_target_feature)] #![cfg_attr(test, feature(test))] #![allow( clippy::result_unwrap_used, @@ -58,7 +58,7 @@ fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { return unsafe { hex_encode_sse41(src, dst) }; } } - #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] + #[cfg(target_arch = "wasm32")] { if true { return unsafe { hex_encode_simd128(src, dst) }; @@ -153,7 +153,8 @@ unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } -#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] +#[cfg(target_arch = "wasm32")] +#[target_feature(enable = "simd128")] unsafe fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { use core_arch::arch::wasm32::*; From 30d19622ec7d3887bb992e1d741405c37ecc189a Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Tue, 14 Jul 2020 05:32:51 -0700 Subject: [PATCH 03/15] Update names of wasm atomic intrinsics This sync the names of the intrinsics with the current spec, renamed in January. --- crates/core_arch/src/wasm32/atomic.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/core_arch/src/wasm32/atomic.rs b/crates/core_arch/src/wasm32/atomic.rs index 024bf94a7f..950f565f92 100644 --- a/crates/core_arch/src/wasm32/atomic.rs +++ b/crates/core_arch/src/wasm32/atomic.rs @@ -20,7 +20,7 @@ extern "C" { fn llvm_atomic_notify(ptr: *mut i32, cnt: i32) -> i32; } -/// Corresponding intrinsic to wasm's [`i32.atomic.wait` instruction][instr] +/// Corresponding intrinsic to wasm's [`memory.atomic.wait32` instruction][instr] /// /// This function, when called, will block the current thread if the memory /// pointed to by `ptr` is equal to `expression` (performing this action @@ -48,14 +48,14 @@ extern "C" { /// library is not obtainable via `rustup`, but rather will require the /// standard library to be compiled from source. /// -/// [instr]: https://github.com/WebAssembly/threads/blob/master/proposals/threads/Overview.md#wait +/// [instr]: https://webassembly.github.io/threads/syntax/instructions.html#syntax-instr-atomic-memory #[inline] #[cfg_attr(test, assert_instr("i32.atomic.wait"))] -pub unsafe fn i32_atomic_wait(ptr: *mut i32, expression: i32, timeout_ns: i64) -> i32 { +pub unsafe fn memory_atomic_wait32(ptr: *mut i32, expression: i32, timeout_ns: i64) -> i32 { llvm_atomic_wait_i32(ptr, expression, timeout_ns) } -/// Corresponding intrinsic to wasm's [`i64.atomic.wait` instruction][instr] +/// Corresponding intrinsic to wasm's [`memory.atomic.wait64` instruction][instr] /// /// This function, when called, will block the current thread if the memory /// pointed to by `ptr` is equal to `expression` (performing this action @@ -83,14 +83,14 @@ pub unsafe fn i32_atomic_wait(ptr: *mut i32, expression: i32, timeout_ns: i64) - /// library is not obtainable via `rustup`, but rather will require the /// standard library to be compiled from source. 
/// -/// [instr]: https://github.com/WebAssembly/threads/blob/master/proposals/threads/Overview.md#wait +/// [instr]: https://webassembly.github.io/threads/syntax/instructions.html#syntax-instr-atomic-memory #[inline] #[cfg_attr(test, assert_instr("i64.atomic.wait"))] -pub unsafe fn i64_atomic_wait(ptr: *mut i64, expression: i64, timeout_ns: i64) -> i32 { +pub unsafe fn memory_atomic_wait64(ptr: *mut i64, expression: i64, timeout_ns: i64) -> i32 { llvm_atomic_wait_i64(ptr, expression, timeout_ns) } -/// Corresponding intrinsic to wasm's [`atomic.notify` instruction][instr] +/// Corresponding intrinsic to wasm's [`memory.atomic.notify` instruction][instr] /// /// This function will notify a number of threads blocked on the address /// indicated by `ptr`. Threads previously blocked with the `i32_atomic_wait` @@ -110,9 +110,9 @@ pub unsafe fn i64_atomic_wait(ptr: *mut i64, expression: i64, timeout_ns: i64) - /// library is not obtainable via `rustup`, but rather will require the /// standard library to be compiled from source. /// -/// [instr]: https://github.com/WebAssembly/threads/blob/master/proposals/threads/Overview.md#wake +/// [instr]: https://webassembly.github.io/threads/syntax/instructions.html#syntax-instr-atomic-memory #[inline] #[cfg_attr(test, assert_instr("atomic.wake"))] -pub unsafe fn atomic_notify(ptr: *mut i32, waiters: u32) -> u32 { +pub unsafe fn memory_atomic_notify(ptr: *mut i32, waiters: u32) -> u32 { llvm_atomic_notify(ptr, waiters as i32) as u32 } From 11f70bf6eb0713f73c50fb944f33098fb7683390 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Tue, 14 Jul 2020 05:57:00 -0700 Subject: [PATCH 04/15] Fix build script output --- crates/core_arch/build.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/build.rs b/crates/core_arch/build.rs index e0c538ceb4..823fb718f2 100644 --- a/crates/core_arch/build.rs +++ b/crates/core_arch/build.rs @@ -11,6 +11,6 @@ fn main() { .unwrap_or_default() .contains("unimplemented-simd128") { - println!("cargo:rust-cfg:all_simd"); + println!("cargo:rust-cfg=all_simd"); } } From 8d7ad156df08d77fc058711f9eb797c6a944e73d Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 15 Jul 2020 09:24:28 -0700 Subject: [PATCH 05/15] Fix issues found when updating libstd's submodule --- crates/core_arch/src/lib.rs | 1 + crates/core_arch/src/mod.rs | 2 -- crates/core_arch/src/wasm32/simd128.rs | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index 97c82b5234..7eacd10098 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -1,4 +1,5 @@ #![doc(include = "core_arch_docs.md")] +#![allow(improper_ctypes_definitions)] #![allow(dead_code)] #![allow(unused_features)] #![feature( diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs index eda2254e29..e902803e4b 100644 --- a/crates/core_arch/src/mod.rs +++ b/crates/core_arch/src/mod.rs @@ -1,7 +1,5 @@ //! `core_arch` -#![allow(improper_ctypes_definitions)] - #[macro_use] mod macros; diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index 76be73a98f..11f08e5251 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -362,6 +362,7 @@ pub const unsafe fn v128_const( /// /// All indexes `$i*` must have the type `u32`. 
#[allow_internal_unstable(platform_intrinsics, rustc_attrs)] +#[unstable(feature = "stdsimd", issue = "27731")] pub macro v8x16_shuffle( $a:expr, $b:expr, $i0:expr, From e5729927e01dd62c9bee39ff0a92c43e7dedbf72 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 15 Jul 2020 09:31:24 -0700 Subject: [PATCH 06/15] Tweak a few items in the docs --- crates/core_arch/src/mod.rs | 2 +- crates/core_arch/src/wasm32/simd128.rs | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs index e902803e4b..d66bbede9d 100644 --- a/crates/core_arch/src/mod.rs +++ b/crates/core_arch/src/mod.rs @@ -88,7 +88,7 @@ pub mod arch { /// /// ## SIMD /// - /// The [simd proposa][simd] for WebAssembly adds a new `v128` type for a + /// The [simd proposal][simd] for WebAssembly adds a new `v128` type for a /// 128-bit SIMD register. It also adds a large array of instructions to /// operate on the `v128` type to perform data processing. The SIMD proposal /// has been in progress for quite some time and many instructions have come diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index 11f08e5251..08fb0917ab 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -410,11 +410,13 @@ pub macro v8x16_shuffle( // internal implementation detail of the `v8x16_shuffle`, done so there's a name // that always works for the macro to import. +#[doc(hidden)] pub use crate::mem::transmute as __v8x16_shuffle_transmute; // internal to this module and only generated here as an implementation detail // of the `v8x16_shuffle` macro. #[repr(simd)] +#[doc(hidden)] pub struct __v8x16_shuffle_u8x16( u8, u8, From 262dd22173ed5fbc8fc0f72529586633b62e3189 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 15 Jul 2020 10:01:00 -0700 Subject: [PATCH 07/15] Update SIMD tracking issue --- crates/core_arch/src/wasm32/simd128.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index 08fb0917ab..b6bcece0f1 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -3,6 +3,7 @@ //! [WebAssembly `SIMD128` ISA]: //! https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md +#![unstable(feature = "wasm_simd", issue = "74372")] #![allow(non_camel_case_types)] #![allow(unused_imports)] @@ -362,7 +363,7 @@ pub const unsafe fn v128_const( /// /// All indexes `$i*` must have the type `u32`. 
#[allow_internal_unstable(platform_intrinsics, rustc_attrs)] -#[unstable(feature = "stdsimd", issue = "27731")] +#[unstable(feature = "wasm_simd", issue = "74372")] pub macro v8x16_shuffle( $a:expr, $b:expr, $i0:expr, @@ -2193,7 +2194,7 @@ pub unsafe fn i32x4_trunc_sat_f32x4_s(a: v128) -> v128 { #[inline] #[cfg_attr(test, assert_instr("i32x4.trunc_sat_f32x4_u"))] #[target_feature(enable = "simd128")] -pub unsafe fn i32x4_trunc_sat_f32x4_su(a: v128) -> v128 { +pub unsafe fn i32x4_trunc_sat_f32x4_u(a: v128) -> v128 { transmute(simd_cast::<_, u32x4>(a.as_f32x4())) } From e04182d3f89fa098f9c28878abe1e27e25ae7833 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 15 Jul 2020 10:48:19 -0700 Subject: [PATCH 08/15] Fix xcrate-macros test --- crates/core_arch/tests/xcrate-macros.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/core_arch/tests/xcrate-macros.rs b/crates/core_arch/tests/xcrate-macros.rs index 9ab0ea83ae..6b26b5b42f 100644 --- a/crates/core_arch/tests/xcrate-macros.rs +++ b/crates/core_arch/tests/xcrate-macros.rs @@ -1,4 +1,4 @@ -#![feature(stdsimd)] +#![feature(stdsimd, wasm_simd)] #[test] #[cfg(target_arch = "wasm32")] From 011c9df718d6d1372e4c089f395146cc9abbc609 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 15 Jul 2020 11:05:46 -0700 Subject: [PATCH 09/15] Fixup some stability annotations and feature directives --- crates/core_arch/src/lib.rs | 1 + crates/core_arch/tests/xcrate-macros.rs | 3 ++- examples/hex.rs | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index 7eacd10098..f21dbb5ce2 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -38,6 +38,7 @@ decl_macro )] #![cfg_attr(test, feature(test, abi_vectorcall, untagged_unions))] +#![cfg_attr(all(test, target_arch = "wasm32"), feature(wasm_simd))] #![deny(clippy::missing_inline_in_public_items)] #![allow( clippy::inline_always, diff --git a/crates/core_arch/tests/xcrate-macros.rs b/crates/core_arch/tests/xcrate-macros.rs index 6b26b5b42f..2b6b46a6e0 100644 --- a/crates/core_arch/tests/xcrate-macros.rs +++ b/crates/core_arch/tests/xcrate-macros.rs @@ -1,4 +1,5 @@ -#![feature(stdsimd, wasm_simd)] +#![feature(stdsimd)] +#![cfg_attr(target_arch = "wasm32", feature(wasm_simd))] #[test] #[cfg(target_arch = "wasm32")] diff --git a/examples/hex.rs b/examples/hex.rs index 10b548391f..1e39eac997 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -14,6 +14,7 @@ #![feature(stdsimd, wasm_target_feature)] #![cfg_attr(test, feature(test))] +#![cfg_attr(target_arch = "wasm32", feature(wasm_simd))] #![allow( clippy::result_unwrap_used, clippy::print_stdout, From 78a11ba266b474d3a7c26a74ef52c88da5294b38 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Wed, 15 Jul 2020 12:21:46 -0700 Subject: [PATCH 10/15] Enable widen instructions Support for them at runtime was added to Wasmtime --- ci/docker/wasm32-wasi/Dockerfile | 10 +-- crates/core_arch/src/wasm32/simd128.rs | 114 ++++++++++++------------- 2 files changed, 58 insertions(+), 66 deletions(-) diff --git a/ci/docker/wasm32-wasi/Dockerfile b/ci/docker/wasm32-wasi/Dockerfile index 7b7d75190c..0e151ebbb2 100644 --- a/ci/docker/wasm32-wasi/Dockerfile +++ b/ci/docker/wasm32-wasi/Dockerfile @@ -3,15 +3,9 @@ FROM ubuntu:20.04 ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update -y && apt-get install -y --no-install-recommends \ ca-certificates \ - clang \ - cmake \ curl \ - git \ - libc6-dev \ - make \ - python \ - python3 \ - xz-utils + xz-utils \ + 
clang RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/dev/wasmtime-dev-x86_64-linux.tar.xz | tar xJf - ENV PATH=$PATH:/wasmtime-dev-x86_64-linux diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index b6bcece0f1..30544ca5a2 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -1580,38 +1580,37 @@ pub unsafe fn i16x8_narrow_i32x4_u(a: v128, b: v128) -> v128 { transmute(llvm_narrow_i16x8_u(transmute(a), transmute(b))) } -// FIXME waiting on a runtime implementation to test -// /// Converts low half of the smaller lane vector to a larger lane -// /// vector, sign extended. -// #[inline] -// #[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_s))] -// pub unsafe fn i16x8_widen_low_i8x16_s(a: v128) -> v128 { -// transmute(llvm_widen_low_i16x8_s(transmute(a))) -// } +/// Converts low half of the smaller lane vector to a larger lane +/// vector, sign extended. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_s))] +pub unsafe fn i16x8_widen_low_i8x16_s(a: v128) -> v128 { + transmute(llvm_widen_low_i16x8_s(transmute(a))) +} -// /// Converts high half of the smaller lane vector to a larger lane -// /// vector, sign extended. -// #[inline] -// #[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_s))] -// pub unsafe fn i16x8_widen_high_i8x16_s(a: v128) -> v128 { -// transmute(llvm_widen_high_i16x8_s(transmute(a))) -// } +/// Converts high half of the smaller lane vector to a larger lane +/// vector, sign extended. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_s))] +pub unsafe fn i16x8_widen_high_i8x16_s(a: v128) -> v128 { + transmute(llvm_widen_high_i16x8_s(transmute(a))) +} -// /// Converts low half of the smaller lane vector to a larger lane -// /// vector, zero extended. -// #[inline] -// #[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_u))] -// pub unsafe fn i16x8_widen_low_i8x16_u(a: v128) -> v128 { -// transmute(llvm_widen_low_i16x8_u(transmute(a))) -// } +/// Converts low half of the smaller lane vector to a larger lane +/// vector, zero extended. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_u))] +pub unsafe fn i16x8_widen_low_i8x16_u(a: v128) -> v128 { + transmute(llvm_widen_low_i16x8_u(transmute(a))) +} -// /// Converts high half of the smaller lane vector to a larger lane -// /// vector, zero extended. -// #[inline] -// #[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_u))] -// pub unsafe fn i16x8_widen_high_i8x16_u(a: v128) -> v128 { -// transmute(llvm_widen_high_i16x8_u(transmute(a))) -// } +/// Converts high half of the smaller lane vector to a larger lane +/// vector, zero extended. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_u))] +pub unsafe fn i16x8_widen_high_i8x16_u(a: v128) -> v128 { + transmute(llvm_widen_high_i16x8_u(transmute(a))) +} /// Shifts each lane to the left by the specified number of bits. /// @@ -1808,38 +1807,37 @@ pub unsafe fn i32x4_all_true(a: v128) -> i32 { // llvm_bitmask_i32x4(transmute(a)) // } -// FIXME waiting on a runtime implementation to test -// /// Converts low half of the smaller lane vector to a larger lane -// /// vector, sign extended. -// #[inline] -// #[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_s))] -// pub unsafe fn i32x4_widen_low_i16x8_s(a: v128) -> v128 { -// transmute(llvm_widen_low_i32x4_s(transmute(a))) -// } +/// Converts low half of the smaller lane vector to a larger lane +/// vector, sign extended. 
+#[inline] +#[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_s))] +pub unsafe fn i32x4_widen_low_i16x8_s(a: v128) -> v128 { + transmute(llvm_widen_low_i32x4_s(transmute(a))) +} -// /// Converts high half of the smaller lane vector to a larger lane -// /// vector, sign extended. -// #[inline] -// #[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_s))] -// pub unsafe fn i32x4_widen_high_i16x8_s(a: v128) -> v128 { -// transmute(llvm_widen_high_i32x4_s(transmute(a))) -// } +/// Converts high half of the smaller lane vector to a larger lane +/// vector, sign extended. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_s))] +pub unsafe fn i32x4_widen_high_i16x8_s(a: v128) -> v128 { + transmute(llvm_widen_high_i32x4_s(transmute(a))) +} -// /// Converts low half of the smaller lane vector to a larger lane -// /// vector, zero extended. -// #[inline] -// #[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_u))] -// pub unsafe fn i32x4_widen_low_i16x8_u(a: v128) -> v128 { -// transmute(llvm_widen_low_i32x4_u(transmute(a))) -// } +/// Converts low half of the smaller lane vector to a larger lane +/// vector, zero extended. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_u))] +pub unsafe fn i32x4_widen_low_i16x8_u(a: v128) -> v128 { + transmute(llvm_widen_low_i32x4_u(transmute(a))) +} -// /// Converts high half of the smaller lane vector to a larger lane -// /// vector, zero extended. -// #[inline] -// #[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_u))] -// pub unsafe fn i32x4_widen_high_i16x8_u(a: v128) -> v128 { -// transmute(llvm_widen_high_i32x4_u(transmute(a))) -// } +/// Converts high half of the smaller lane vector to a larger lane +/// vector, zero extended. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_u))] +pub unsafe fn i32x4_widen_high_i16x8_u(a: v128) -> v128 { + transmute(llvm_widen_high_i32x4_u(transmute(a))) +} /// Shifts each lane to the left by the specified number of bits. /// From 5dde15862298054c23102953e460c437cf09c0c0 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Thu, 16 Jul 2020 12:24:17 -0700 Subject: [PATCH 11/15] Switch from #[rustc_args_required_const] to const generics This commit switches all wasm simd intrinsics to using const generics instead of the funky `#[rustc_args_required_const]` attribute. This is ideally a bit more future-proof and more readily expresses the constraints of these instructions! --- crates/core_arch/build.rs | 3 +- crates/core_arch/src/lib.rs | 2 + crates/core_arch/src/wasm32/simd128.rs | 656 +++++++++++++++--------- crates/core_arch/tests/xcrate-macros.rs | 21 - examples/hex.rs | 8 +- 5 files changed, 408 insertions(+), 282 deletions(-) delete mode 100644 crates/core_arch/tests/xcrate-macros.rs diff --git a/crates/core_arch/build.rs b/crates/core_arch/build.rs index 823fb718f2..8a347e3f62 100644 --- a/crates/core_arch/build.rs +++ b/crates/core_arch/build.rs @@ -7,10 +7,11 @@ fn main() { // are available to test their codegen, since some are gated behind an extra // `-Ctarget-feature=+unimplemented-simd128` that doesn't have any // equivalent in `#[target_feature]` right now. 
+ println!("cargo:rerun-if-env-changed=RUSTFLAGS"); if env::var("RUSTFLAGS") .unwrap_or_default() .contains("unimplemented-simd128") { - println!("cargo:rust-cfg=all_simd"); + println!("cargo:rustc-cfg=all_simd"); } } diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index f21dbb5ce2..aa8d4c9820 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -2,9 +2,11 @@ #![allow(improper_ctypes_definitions)] #![allow(dead_code)] #![allow(unused_features)] +#![allow(incomplete_features)] #![feature( const_fn, const_fn_union, + const_generics, custom_inner_attributes, link_llvm_intrinsics, platform_intrinsics, diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index 30544ca5a2..59cba76eda 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -295,57 +295,169 @@ pub unsafe fn v128_store(m: *mut v128, a: v128) { /// Materializes a constant SIMD value from the immediate operands. /// /// The `v128.const` instruction is encoded with 16 immediate bytes -/// `imm` which provide the bits of the vector directly. -#[inline] -#[rustc_args_required_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)] -#[cfg_attr(all(test, all_simd), assert_instr( - v128.const, - a0 = 0, - a1 = 1, - a2 = 2, - a3 = 3, - a4 = 4, - a5 = 5, - a6 = 6, - a7 = 7, - a8 = 8, - a9 = 9, - a10 = 10, - a11 = 11, - a12 = 12, - a13 = 13, - a14 = 14, - a15 = 15, -))] -#[target_feature(enable = "simd128")] -pub const unsafe fn v128_const( - a0: u8, - a1: u8, - a2: u8, - a3: u8, - a4: u8, - a5: u8, - a6: u8, - a7: u8, - a8: u8, - a9: u8, - a10: u8, - a11: u8, - a12: u8, - a13: u8, - a14: u8, - a15: u8, -) -> v128 { - union U { - imm: [u8; 16], - vec: v128, - } - U { - imm: [ - a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, - ], - } - .vec +/// which provide the bits of the vector directly. +#[inline] +#[target_feature(enable = "simd128")] +pub const unsafe fn v128_const< + const A0: u8, + const A1: u8, + const A2: u8, + const A3: u8, + const A4: u8, + const A5: u8, + const A6: u8, + const A7: u8, + const A8: u8, + const A9: u8, + const A10: u8, + const A11: u8, + const A12: u8, + const A13: u8, + const A14: u8, + const A15: u8, +>() -> v128 { + transmute(u8x16( + A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, + )) +} + +#[cfg(all(test, all_simd))] +#[assert_instr(v128.const)] +#[target_feature(enable = "simd128")] +unsafe fn test_v128_const() -> v128 { + v128_const::<0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>() +} + +/// Materializes a constant SIMD value from the immediate operands. +/// +/// This function generates a `v128.const` instruction as if the generated +/// vector was interpreted as sixteen 8-bit integers. +#[inline] +#[target_feature(enable = "simd128")] +pub const unsafe fn i8x16_const< + const A0: i8, + const A1: i8, + const A2: i8, + const A3: i8, + const A4: i8, + const A5: i8, + const A6: i8, + const A7: i8, + const A8: i8, + const A9: i8, + const A10: i8, + const A11: i8, + const A12: i8, + const A13: i8, + const A14: i8, + const A15: i8, +>() -> v128 { + transmute(i8x16( + A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, + )) +} + +#[cfg(all(test, all_simd))] +#[assert_instr(v128.const)] +#[target_feature(enable = "simd128")] +unsafe fn test_i8x16_const() -> v128 { + i8x16_const::<0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>() +} + +/// Materializes a constant SIMD value from the immediate operands. 
+///
+/// This function generates a `v128.const` instruction as if the generated
+/// vector was interpreted as eight 16-bit integers.
+#[inline]
+#[target_feature(enable = "simd128")]
+pub const unsafe fn i16x8_const<
+    const A0: i16,
+    const A1: i16,
+    const A2: i16,
+    const A3: i16,
+    const A4: i16,
+    const A5: i16,
+    const A6: i16,
+    const A7: i16,
+>() -> v128 {
+    transmute(i16x8(A0, A1, A2, A3, A4, A5, A6, A7))
+}
+
+#[cfg(all(test, all_simd))]
+#[assert_instr(v128.const)]
+#[target_feature(enable = "simd128")]
+unsafe fn test_i16x8_const() -> v128 {
+    i16x8_const::<0, 0, 0, 0, 1, 0, 0, 0>()
+}
+
+/// Materializes a constant SIMD value from the immediate operands.
+///
+/// This function generates a `v128.const` instruction as if the generated
+/// vector was interpreted as four 32-bit integers.
+#[inline]
+#[target_feature(enable = "simd128")]
+pub const unsafe fn i32x4_const<const A0: i32, const A1: i32, const A2: i32, const A3: i32>() -> v128
+{
+    transmute(i32x4(A0, A1, A2, A3))
+}
+
+#[cfg(all(test, all_simd))]
+#[assert_instr(v128.const)]
+#[target_feature(enable = "simd128")]
+unsafe fn test_i32x4_const() -> v128 {
+    i32x4_const::<0, 0, 0, 1>()
+}
+
+/// Materializes a constant SIMD value from the immediate operands.
+///
+/// This function generates a `v128.const` instruction as if the generated
+/// vector was interpreted as two 64-bit integers.
+#[inline]
+#[target_feature(enable = "simd128")]
+pub const unsafe fn i64x2_const<const A0: i64, const A1: i64>() -> v128 {
+    transmute(i64x2(A0, A1))
+}
+
+#[cfg(all(test, all_simd))]
+#[assert_instr(v128.const)]
+#[target_feature(enable = "simd128")]
+unsafe fn test_i64x2_const() -> v128 {
+    i64x2_const::<0, 2>()
+}
+
+/// Materializes a constant SIMD value from the immediate operands.
+///
+/// This function generates a `v128.const` instruction as if the generated
+/// vector was interpreted as four 32-bit floats.
+#[inline]
+#[target_feature(enable = "simd128")]
+pub const unsafe fn f32x4_const<const A0: f32, const A1: f32, const A2: f32, const A3: f32>() -> v128
+{
+    transmute(f32x4(A0, A1, A2, A3))
+}
+
+#[cfg(all(test, all_simd))]
+#[assert_instr(v128.const)]
+#[target_feature(enable = "simd128")]
+unsafe fn test_f32x4_const() -> v128 {
+    f32x4_const::<0.0, 1.0, 0.0, 0.0>()
+}
+
+/// Materializes a constant SIMD value from the immediate operands.
+///
+/// This function generates a `v128.const` instruction as if the generated
+/// vector was interpreted as two 64-bit floats.
+#[inline]
+#[target_feature(enable = "simd128")]
+pub const unsafe fn f64x2_const<const A0: f64, const A1: f64>() -> v128 {
+    transmute(f64x2(A0, A1))
+}
+
+#[cfg(all(test, all_simd))]
+#[assert_instr(v128.const)]
+#[target_feature(enable = "simd128")]
+unsafe fn test_f64x2_const() -> v128 {
+    f64x2_const::<0.0, 1.0>()
 }
 
 /// Returns a new vector with lanes selected from the lanes of the two input
 /// vectors `a` and `b` specified in the 16 immediate operands.
@@ -362,80 +474,124 @@ pub const unsafe fn v128_const(
 /// generated if any of the expressions are not constant.
 ///
 /// All indexes `$i*` must have the type `u32`.
-#[allow_internal_unstable(platform_intrinsics, rustc_attrs)]
-#[unstable(feature = "wasm_simd", issue = "74372")]
-pub macro v8x16_shuffle(
-    $a:expr, $b:expr,
-    $i0:expr,
-    $i1:expr,
-    $i2:expr,
-    $i3:expr,
-    $i4:expr,
-    $i5:expr,
-    $i6:expr,
-    $i7:expr,
-    $i8:expr,
-    $i9:expr,
-    $i10:expr,
-    $i11:expr,
-    $i12:expr,
-    $i13:expr,
-    $i14:expr,
-    $i15:expr $(,)?
-) {{ - extern "platform-intrinsic" { - #[rustc_args_required_const(2)] - pub fn simd_shuffle16(x: T, y: T, idx: [u32; 16]) -> U; - } - let shuf = simd_shuffle16::< - $crate::arch::wasm32::__v8x16_shuffle_u8x16, - $crate::arch::wasm32::__v8x16_shuffle_u8x16, - >( - $crate::arch::wasm32::__v8x16_shuffle_transmute::< - $crate::arch::wasm32::v128, - $crate::arch::wasm32::__v8x16_shuffle_u8x16, - >($a), - $crate::arch::wasm32::__v8x16_shuffle_transmute::< - $crate::arch::wasm32::v128, - $crate::arch::wasm32::__v8x16_shuffle_u8x16, - >($b), +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v8x16_shuffle< + const I0: u32, + const I1: u32, + const I2: u32, + const I3: u32, + const I4: u32, + const I5: u32, + const I6: u32, + const I7: u32, + const I8: u32, + const I9: u32, + const I10: u32, + const I11: u32, + const I12: u32, + const I13: u32, + const I14: u32, + const I15: u32, +>( + a: v128, + b: v128, +) -> v128 { + let shuf = simd_shuffle16::( + a.as_u8x16(), + b.as_u8x16(), [ - $i0, $i1, $i2, $i3, $i4, $i5, $i6, $i7, $i8, $i9, $i10, $i11, $i12, $i13, $i14, $i15, + I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, ], ); - $crate::arch::wasm32::__v8x16_shuffle_transmute::< - $crate::arch::wasm32::__v8x16_shuffle_u8x16, - $crate::arch::wasm32::v128, - >(shuf) -}} - -// internal implementation detail of the `v8x16_shuffle`, done so there's a name -// that always works for the macro to import. -#[doc(hidden)] -pub use crate::mem::transmute as __v8x16_shuffle_transmute; - -// internal to this module and only generated here as an implementation detail -// of the `v8x16_shuffle` macro. -#[repr(simd)] -#[doc(hidden)] -pub struct __v8x16_shuffle_u8x16( - u8, - u8, - u8, - u8, - u8, - u8, - u8, - u8, - u8, - u8, - u8, - u8, - u8, - u8, - u8, - u8, -); + transmute(shuf) +} + +#[cfg(test)] +#[assert_instr(v8x16.shuffle)] +#[target_feature(enable = "simd128")] +unsafe fn v8x16_shuffle_test(a: v128, b: v128) -> v128 { + v8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(a, b) +} + +/// Same as [`v8x16_shuffle`], except operates as if the inputs were eight +/// 16-bit integers, only taking 8 indices to shuffle. +/// +/// Indices in the range [0, 7] select from `a` while [8, 15] select from `b`. +/// Note that this will generate the `v8x16.shuffle` instruction, since there +/// is no native `v16x8.shuffle` instruction (there is no need for one since +/// `v8x16.shuffle` suffices). +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v16x8_shuffle< + const I0: u32, + const I1: u32, + const I2: u32, + const I3: u32, + const I4: u32, + const I5: u32, + const I6: u32, + const I7: u32, +>( + a: v128, + b: v128, +) -> v128 { + let shuf = + simd_shuffle8::(a.as_u16x8(), b.as_u16x8(), [I0, I1, I2, I3, I4, I5, I6, I7]); + transmute(shuf) +} + +#[cfg(test)] +#[assert_instr(v8x16.shuffle)] +#[target_feature(enable = "simd128")] +unsafe fn v16x8_shuffle_test(a: v128, b: v128) -> v128 { + v16x8_shuffle::<0, 2, 4, 6, 8, 10, 12, 14>(a, b) +} + +/// Same as [`v8x16_shuffle`], except operates as if the inputs were four +/// 32-bit integers, only taking 4 indices to shuffle. +/// +/// Indices in the range [0, 3] select from `a` while [4, 7] select from `b`. +/// Note that this will generate the `v8x16.shuffle` instruction, since there +/// is no native `v32x4.shuffle` instruction (there is no need for one since +/// `v8x16.shuffle` suffices). 
+#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v32x4_shuffle( + a: v128, + b: v128, +) -> v128 { + let shuf = simd_shuffle4::(a.as_u32x4(), b.as_u32x4(), [I0, I1, I2, I3]); + transmute(shuf) +} + +#[cfg(test)] +#[assert_instr(v8x16.shuffle)] +#[target_feature(enable = "simd128")] +unsafe fn v32x4_shuffle_test(a: v128, b: v128) -> v128 { + v32x4_shuffle::<0, 2, 4, 6>(a, b) +} + +/// Same as [`v8x16_shuffle`], except operates as if the inputs were two +/// 64-bit integers, only taking 2 indices to shuffle. +/// +/// Indices in the range [0, 1] select from `a` while [2, 3] select from `b`. +/// Note that this will generate the `v8x16.shuffle` instruction, since there +/// is no native `v64x2.shuffle` instruction (there is no need for one since +/// `v8x16.shuffle` suffices). +#[inline] +#[target_feature(enable = "simd128")] +pub unsafe fn v64x2_shuffle(a: v128, b: v128) -> v128 { + let shuf = simd_shuffle2::(a.as_u64x2(), b.as_u64x2(), [I0, I1]); + transmute(shuf) +} + +#[cfg(test)] +#[assert_instr(v8x16.shuffle)] +#[target_feature(enable = "simd128")] +unsafe fn v64x2_shuffle_test(a: v128, b: v128) -> v128 { + v64x2_shuffle::<0, 2>(a, b) +} /// Returns a new vector with lanes selected from the lanes of the first input /// vector `a` specified in the second input vector `s`. @@ -512,231 +668,219 @@ pub unsafe fn f64x2_splat(a: f64) -> v128 { /// Extracts a lane from a 128-bit vector interpreted as 16 packed i8 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand -/// `imm` from `a`. -/// -/// # Unsafety -/// -/// This function has undefined behavior if `imm` is greater than or equal to -/// 16. +/// `N` from `a`. If `N` is out of bounds then it is a compile time error. #[inline] -#[rustc_args_required_const(1)] #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_extract_lane(a: v128, imm: usize) -> i8 { - simd_extract(a.as_i8x16(), imm as u32) +pub unsafe fn i8x16_extract_lane(a: v128) -> i8 { + simd_extract(a.as_i8x16(), N) } #[cfg(test)] #[assert_instr(i8x16.extract_lane_s)] #[target_feature(enable = "simd128")] unsafe fn i8x16_extract_lane_s(a: v128) -> i32 { - i8x16_extract_lane(a, 0) as i32 + i8x16_extract_lane::<0>(a) as i32 } #[cfg(test)] #[assert_instr(i8x16.extract_lane_u)] #[target_feature(enable = "simd128")] unsafe fn i8x16_extract_lane_u(a: v128) -> u32 { - i8x16_extract_lane(a, 0) as u8 as u32 + i8x16_extract_lane::<0>(a) as u8 as u32 } /// Replaces a lane from a 128-bit vector interpreted as 16 packed i8 numbers. /// /// Replaces the scalar value of lane specified in the immediate mode operand -/// `imm` with `a`. -/// -/// # Unsafety -/// -/// This function has undefined behavior if `imm` is greater than or equal to -/// 16. +/// `N` from `a`. If `N` is out of bounds then it is a compile time error. #[inline] -#[cfg_attr(test, assert_instr(i8x16.replace_lane, imm = 0))] -#[rustc_args_required_const(1)] #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_replace_lane(a: v128, imm: usize, val: i8) -> v128 { - transmute(simd_insert(a.as_i8x16(), imm as u32, val)) +pub unsafe fn i8x16_replace_lane(a: v128, val: i8) -> v128 { + transmute(simd_insert(a.as_i8x16(), N, val)) +} + +#[cfg(test)] +#[assert_instr(i8x16.replace_lane)] +#[target_feature(enable = "simd128")] +unsafe fn i8x16_replace_lane_test(a: v128, val: i8) -> v128 { + i8x16_replace_lane::<0>(a, val) } /// Extracts a lane from a 128-bit vector interpreted as 8 packed i16 numbers. 
 ///
 /// Extracts the scalar value of lane specified in the immediate mode operand
-/// `imm` from `a`.
-///
-/// # Unsafety
-///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 8.
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
-#[rustc_args_required_const(1)]
 #[target_feature(enable = "simd128")]
-pub unsafe fn i16x8_extract_lane(a: v128, imm: usize) -> i16 {
-    simd_extract(a.as_i16x8(), imm as u32)
+pub unsafe fn i16x8_extract_lane<const N: u32>(a: v128) -> i16 {
+    simd_extract(a.as_i16x8(), N)
 }
 
 #[cfg(test)]
 #[assert_instr(i16x8.extract_lane_s)]
 #[target_feature(enable = "simd128")]
 unsafe fn i16x8_extract_lane_s(a: v128) -> i32 {
-    i16x8_extract_lane(a, 0) as i32
+    i16x8_extract_lane::<0>(a) as i32
 }
 
 #[cfg(test)]
 #[assert_instr(i16x8.extract_lane_u)]
 #[target_feature(enable = "simd128")]
 unsafe fn i16x8_extract_lane_u(a: v128) -> u32 {
-    i16x8_extract_lane(a, 0) as u16 as u32
+    i16x8_extract_lane::<0>(a) as u16 as u32
 }
 
 /// Replaces a lane from a 128-bit vector interpreted as 8 packed i16 numbers.
 ///
 /// Replaces the scalar value of lane specified in the immediate mode operand
-/// `imm` with `a`.
-///
-/// # Unsafety
-///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 8.
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
-#[cfg_attr(test, assert_instr(i16x8.replace_lane, imm = 0))]
-#[rustc_args_required_const(1)]
 #[target_feature(enable = "simd128")]
-pub unsafe fn i16x8_replace_lane(a: v128, imm: usize, val: i16) -> v128 {
-    transmute(simd_insert(a.as_i16x8(), imm as u32, val))
+pub unsafe fn i16x8_replace_lane<const N: u32>(a: v128, val: i16) -> v128 {
+    transmute(simd_insert(a.as_i16x8(), N, val))
+}
+
+#[cfg(test)]
+#[assert_instr(i16x8.replace_lane)]
+#[target_feature(enable = "simd128")]
+unsafe fn i16x8_replace_lane_test(a: v128, val: i16) -> v128 {
+    i16x8_replace_lane::<0>(a, val)
 }
 
 /// Extracts a lane from a 128-bit vector interpreted as 4 packed i32 numbers.
 ///
 /// Extracts the scalar value of lane specified in the immediate mode operand
-/// `imm` from `a`.
-///
-/// # Unsafety
-///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 4.
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
-#[cfg_attr(test, assert_instr(i32x4.extract_lane, imm = 0))]
-#[rustc_args_required_const(1)]
 #[target_feature(enable = "simd128")]
-pub unsafe fn i32x4_extract_lane(a: v128, imm: usize) -> i32 {
-    simd_extract(a.as_i32x4(), imm as u32)
+pub unsafe fn i32x4_extract_lane<const N: u32>(a: v128) -> i32 {
+    simd_extract(a.as_i32x4(), N)
+}
+
+#[cfg(test)]
+#[assert_instr(i32x4.extract_lane)]
+#[target_feature(enable = "simd128")]
+unsafe fn i32x4_extract_lane_test(a: v128) -> i32 {
+    i32x4_extract_lane::<0>(a)
 }
 
 /// Replaces a lane from a 128-bit vector interpreted as 4 packed i32 numbers.
 ///
 /// Replaces the scalar value of lane specified in the immediate mode operand
-/// `imm` with `a`.
-///
-/// # Unsafety
-///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 4.
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
-#[cfg_attr(test, assert_instr(i32x4.replace_lane, imm = 0))]
-#[rustc_args_required_const(1)]
 #[target_feature(enable = "simd128")]
-pub unsafe fn i32x4_replace_lane(a: v128, imm: usize, val: i32) -> v128 {
-    transmute(simd_insert(a.as_i32x4(), imm as u32, val))
+pub unsafe fn i32x4_replace_lane<const N: u32>(a: v128, val: i32) -> v128 {
+    transmute(simd_insert(a.as_i32x4(), N, val))
+}
+
+#[cfg(test)]
+#[assert_instr(i32x4.replace_lane)]
+#[target_feature(enable = "simd128")]
+unsafe fn i32x4_replace_lane_test(a: v128, val: i32) -> v128 {
+    i32x4_replace_lane::<0>(a, val)
 }
 
 /// Extracts a lane from a 128-bit vector interpreted as 2 packed i64 numbers.
 ///
 /// Extracts the scalar value of lane specified in the immediate mode operand
-/// `imm` from `a`.
-///
-/// # Unsafety
-///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 2.
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
-#[cfg_attr(test, assert_instr(i64x2.extract_lane, imm = 0))]
-#[rustc_args_required_const(1)]
 #[target_feature(enable = "simd128")]
-pub unsafe fn i64x2_extract_lane(a: v128, imm: usize) -> i64 {
-    simd_extract(a.as_i64x2(), imm as u32)
+pub unsafe fn i64x2_extract_lane<const N: u32>(a: v128) -> i64 {
+    simd_extract(a.as_i64x2(), N)
+}
+
+#[cfg(test)]
+#[assert_instr(i64x2.extract_lane)]
+#[target_feature(enable = "simd128")]
+unsafe fn i64x2_extract_lane_test(a: v128) -> i64 {
+    i64x2_extract_lane::<0>(a)
 }
 
 /// Replaces a lane from a 128-bit vector interpreted as 2 packed i64 numbers.
 ///
 /// Replaces the scalar value of lane specified in the immediate mode operand
-/// `imm` with `a`.
-///
-/// # Unsafety
-///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 2.
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
-#[cfg_attr(test, assert_instr(i64x2.replace_lane, imm = 0))]
-#[rustc_args_required_const(1)]
 #[target_feature(enable = "simd128")]
-pub unsafe fn i64x2_replace_lane(a: v128, imm: usize, val: i64) -> v128 {
-    transmute(simd_insert(a.as_i64x2(), imm as u32, val))
+pub unsafe fn i64x2_replace_lane<const N: u32>(a: v128, val: i64) -> v128 {
+    transmute(simd_insert(a.as_i64x2(), N, val))
+}
+
+#[cfg(test)]
+#[assert_instr(i64x2.replace_lane)]
+#[target_feature(enable = "simd128")]
+unsafe fn i64x2_replace_lane_test(a: v128, val: i64) -> v128 {
+    i64x2_replace_lane::<0>(a, val)
 }
 
 /// Extracts a lane from a 128-bit vector interpreted as 4 packed f32 numbers.
 ///
-/// Extracts the scalar value of lane specified in the immediate mode operand
-/// `imm` from `a`.
-///
-/// # Unsafety
-///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 4.
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
-#[cfg_attr(test, assert_instr(f32x4.extract_lane, imm = 0))]
-#[rustc_args_required_const(1)]
 #[target_feature(enable = "simd128")]
-pub unsafe fn f32x4_extract_lane(a: v128, imm: usize) -> f32 {
-    simd_extract(a.as_f32x4(), imm as u32)
+pub unsafe fn f32x4_extract_lane<const N: u32>(a: v128) -> f32 {
+    simd_extract(a.as_f32x4(), N)
+}
+
+#[cfg(test)]
+#[assert_instr(f32x4.extract_lane)]
+#[target_feature(enable = "simd128")]
+unsafe fn f32x4_extract_lane_test(a: v128) -> f32 {
+    f32x4_extract_lane::<0>(a)
 }
 
 /// Replaces a lane from a 128-bit vector interpreted as 4 packed f32 numbers.
 ///
-/// Replaces the scalar value of lane specified in the immediate mode operand
-/// `imm` with `a`.
-///
-/// # Unsafety
-///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 4.
+/// Replaces the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
-#[cfg_attr(test, assert_instr(f32x4.replace_lane, imm = 0))]
-#[rustc_args_required_const(1)]
 #[target_feature(enable = "simd128")]
-pub unsafe fn f32x4_replace_lane(a: v128, imm: usize, val: f32) -> v128 {
-    transmute(simd_insert(a.as_f32x4(), imm as u32, val))
+pub unsafe fn f32x4_replace_lane<const N: u32>(a: v128, val: f32) -> v128 {
+    transmute(simd_insert(a.as_f32x4(), N, val))
 }
 
-/// Extracts lane from a 128-bit vector interpreted as 2 packed f64 numbers.
-///
-/// Extracts the scalar value of lane specified in the immediate mode operand
-/// `imm` from `a`.
-///
-/// # Unsafety
+#[cfg(test)]
+#[assert_instr(f32x4.replace_lane)]
+#[target_feature(enable = "simd128")]
+unsafe fn f32x4_replace_lane_test(a: v128, val: f32) -> v128 {
+    f32x4_replace_lane::<0>(a, val)
+}
+
+/// Extracts a lane from a 128-bit vector interpreted as 2 packed f64 numbers.
 ///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 2.
+/// Extracts the scalar value of lane specified in the immediate mode operand
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
-#[cfg_attr(test, assert_instr(f64x2.extract_lane, imm = 0))]
-#[rustc_args_required_const(1)]
 #[target_feature(enable = "simd128")]
-pub unsafe fn f64x2_extract_lane(a: v128, imm: usize) -> f64 {
-    simd_extract(a.as_f64x2(), imm as u32)
+pub unsafe fn f64x2_extract_lane<const N: u32>(a: v128) -> f64 {
+    simd_extract(a.as_f64x2(), N)
+}
+
+#[cfg(test)]
+#[assert_instr(f64x2.extract_lane)]
+#[target_feature(enable = "simd128")]
+unsafe fn f64x2_extract_lane_test(a: v128) -> f64 {
+    f64x2_extract_lane::<0>(a)
 }
 
 /// Replaces a lane from a 128-bit vector interpreted as 2 packed f64 numbers.
 ///
 /// Replaces the scalar value of lane specified in the immediate mode operand
-/// `imm` with `a`.
-///
-/// # Unsafety
-///
-/// This function has undefined behavior if `imm` is greater than or equal to
-/// 2.
+/// `N` from `a`. If `N` is out of bounds then it is a compile time error.
#[inline] -#[cfg_attr(test, assert_instr(f64x2.replace_lane, imm = 0))] -#[rustc_args_required_const(1)] #[target_feature(enable = "simd128")] -pub unsafe fn f64x2_replace_lane(a: v128, imm: usize, val: f64) -> v128 { - transmute(simd_insert(a.as_f64x2(), imm as u32, val)) +pub unsafe fn f64x2_replace_lane(a: v128, val: f64) -> v128 { + transmute(simd_insert(a.as_f64x2(), N, val)) +} + +#[cfg(test)] +#[assert_instr(f64x2.replace_lane)] +#[target_feature(enable = "simd128")] +unsafe fn f64x2_replace_lane_test(a: v128, val: f64) -> v128 { + f64x2_replace_lane::<0>(a, val) } /// Compares two 128-bit vectors as if they were two vectors of 16 eight-bit @@ -2231,7 +2375,7 @@ pub mod tests { #[test] fn test_v128_const() { const A: v128 = - unsafe { super::v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) }; + unsafe { super::v128_const::<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>() }; compare_bytes(A, A); } @@ -2271,7 +2415,7 @@ pub mod tests { let arr: [$elem; $count] = [123 as $elem; $count]; let vec: v128 = transmute(arr); $( - assert_eq!($extract(vec, $idx), 123 as $elem); + assert_eq!($extract::<$idx>(vec), 123 as $elem); )* // create a vector from array and check that the indices contain @@ -2279,10 +2423,10 @@ pub mod tests { let arr: [$elem; $count] = [$($idx as $elem),*]; let vec: v128 = transmute(arr); $( - assert_eq!($extract(vec, $idx), $idx as $elem); + assert_eq!($extract::<$idx>(vec), $idx as $elem); - let tmp = $replace(vec, $idx, 124 as $elem); - assert_eq!($extract(tmp, $idx), 124 as $elem); + let tmp = $replace::<$idx>(vec, 124 as $elem); + assert_eq!($extract::<$idx>(tmp), 124 as $elem); )* } } @@ -2503,7 +2647,7 @@ pub mod tests { } #[test] - fn v8x16_shuffle() { + fn test_v8x16_shuffle() { unsafe { let a = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; let b = [ @@ -2513,8 +2657,8 @@ pub mod tests { let vec_a: v128 = transmute(a); let vec_b: v128 = transmute(b); - let vec_r = v8x16_shuffle!( - vec_a, vec_b, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30, + let vec_r = v8x16_shuffle::<0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30>( + vec_a, vec_b, ); let e = [0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]; diff --git a/crates/core_arch/tests/xcrate-macros.rs b/crates/core_arch/tests/xcrate-macros.rs deleted file mode 100644 index 2b6b46a6e0..0000000000 --- a/crates/core_arch/tests/xcrate-macros.rs +++ /dev/null @@ -1,21 +0,0 @@ -#![feature(stdsimd)] -#![cfg_attr(target_arch = "wasm32", feature(wasm_simd))] - -#[test] -#[cfg(target_arch = "wasm32")] -fn wut() { - unsafe { - use core_arch::arch::wasm32; - let a = wasm32::v128_const(0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - let b = wasm32::v128_const( - 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - ); - - let vec_r = wasm32::v8x16_shuffle!( - a, b, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30, - ); - - let e = wasm32::v128_const(0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30); - assert_eq!(wasm32::i8x16_all_true(wasm32::i8x16_eq(e, vec_r)), 1); - } -} diff --git a/examples/hex.rs b/examples/hex.rs index 1e39eac997..43826989b5 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -183,11 +183,11 @@ unsafe fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&' // original source text order. The first element (res1) we'll store uses // all the low bytes from the 2 masks and the second element (res2) uses // all the upper bytes. 
- let res1 = v8x16_shuffle!( - masked2, masked1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + let res1 = v8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + masked2, masked1, ); - let res2 = v8x16_shuffle!( - masked2, masked1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + let res2 = v8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + masked2, masked1, ); v128_store(dst.as_mut_ptr().offset(i * 2) as *mut _, res1); From bfae7eadafc1e43fb8f6f77a0d9810a6f96f1e1d Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 17 Jul 2020 07:16:38 -0700 Subject: [PATCH 12/15] Switch const-level indexing to use `usize` --- crates/core_arch/src/wasm32/simd128.rs | 121 ++++++++++++++----------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index 59cba76eda..099d664854 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -477,22 +477,22 @@ unsafe fn test_f64x2_const() -> v128 { #[inline] #[target_feature(enable = "simd128")] pub unsafe fn v8x16_shuffle< - const I0: u32, - const I1: u32, - const I2: u32, - const I3: u32, - const I4: u32, - const I5: u32, - const I6: u32, - const I7: u32, - const I8: u32, - const I9: u32, - const I10: u32, - const I11: u32, - const I12: u32, - const I13: u32, - const I14: u32, - const I15: u32, + const I0: usize, + const I1: usize, + const I2: usize, + const I3: usize, + const I4: usize, + const I5: usize, + const I6: usize, + const I7: usize, + const I8: usize, + const I9: usize, + const I10: usize, + const I11: usize, + const I12: usize, + const I13: usize, + const I14: usize, + const I15: usize, >( a: v128, b: v128, @@ -501,7 +501,9 @@ pub unsafe fn v8x16_shuffle< a.as_u8x16(), b.as_u8x16(), [ - I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, + I0 as u32, I1 as u32, I2 as u32, I3 as u32, I4 as u32, I5 as u32, I6 as u32, I7 as u32, + I8 as u32, I9 as u32, I10 as u32, I11 as u32, I12 as u32, I13 as u32, I14 as u32, + I15 as u32, ], ); transmute(shuf) @@ -524,20 +526,25 @@ unsafe fn v8x16_shuffle_test(a: v128, b: v128) -> v128 { #[inline] #[target_feature(enable = "simd128")] pub unsafe fn v16x8_shuffle< - const I0: u32, - const I1: u32, - const I2: u32, - const I3: u32, - const I4: u32, - const I5: u32, - const I6: u32, - const I7: u32, + const I0: usize, + const I1: usize, + const I2: usize, + const I3: usize, + const I4: usize, + const I5: usize, + const I6: usize, + const I7: usize, >( a: v128, b: v128, ) -> v128 { - let shuf = - simd_shuffle8::(a.as_u16x8(), b.as_u16x8(), [I0, I1, I2, I3, I4, I5, I6, I7]); + let shuf = simd_shuffle8::( + a.as_u16x8(), + b.as_u16x8(), + [ + I0 as u32, I1 as u32, I2 as u32, I3 as u32, I4 as u32, I5 as u32, I6 as u32, I7 as u32, + ], + ); transmute(shuf) } @@ -557,11 +564,15 @@ unsafe fn v16x8_shuffle_test(a: v128, b: v128) -> v128 { /// `v8x16.shuffle` suffices). #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn v32x4_shuffle( +pub unsafe fn v32x4_shuffle( a: v128, b: v128, ) -> v128 { - let shuf = simd_shuffle4::(a.as_u32x4(), b.as_u32x4(), [I0, I1, I2, I3]); + let shuf = simd_shuffle4::( + a.as_u32x4(), + b.as_u32x4(), + [I0 as u32, I1 as u32, I2 as u32, I3 as u32], + ); transmute(shuf) } @@ -581,8 +592,8 @@ unsafe fn v32x4_shuffle_test(a: v128, b: v128) -> v128 { /// `v8x16.shuffle` suffices). 
#[inline] #[target_feature(enable = "simd128")] -pub unsafe fn v64x2_shuffle(a: v128, b: v128) -> v128 { - let shuf = simd_shuffle2::(a.as_u64x2(), b.as_u64x2(), [I0, I1]); +pub unsafe fn v64x2_shuffle(a: v128, b: v128) -> v128 { + let shuf = simd_shuffle2::(a.as_u64x2(), b.as_u64x2(), [I0 as u32, I1 as u32]); transmute(shuf) } @@ -671,8 +682,8 @@ pub unsafe fn f64x2_splat(a: f64) -> v128 { /// `N` from `a`. If `N` is out of bounds then it is a compile time error. #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_extract_lane(a: v128) -> i8 { - simd_extract(a.as_i8x16(), N) +pub unsafe fn i8x16_extract_lane(a: v128) -> i8 { + simd_extract(a.as_i8x16(), N as u32) } #[cfg(test)] @@ -695,8 +706,8 @@ unsafe fn i8x16_extract_lane_u(a: v128) -> u32 { /// `N` from `a`. If `N` is out of bounds then it is a compile time error. #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn i8x16_replace_lane(a: v128, val: i8) -> v128 { - transmute(simd_insert(a.as_i8x16(), N, val)) +pub unsafe fn i8x16_replace_lane(a: v128, val: i8) -> v128 { + transmute(simd_insert(a.as_i8x16(), N as u32, val)) } #[cfg(test)] @@ -712,8 +723,8 @@ unsafe fn i8x16_replace_lane_test(a: v128, val: i8) -> v128 { /// `N` from `a`. If `N` is out of bounds then it is a compile time error. #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_extract_lane(a: v128) -> i16 { - simd_extract(a.as_i16x8(), N) +pub unsafe fn i16x8_extract_lane(a: v128) -> i16 { + simd_extract(a.as_i16x8(), N as u32) } #[cfg(test)] @@ -736,8 +747,8 @@ unsafe fn i16x8_extract_lane_u(a: v128) -> u32 { /// `N` from `a`. If `N` is out of bounds then it is a compile time error. #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn i16x8_replace_lane(a: v128, val: i16) -> v128 { - transmute(simd_insert(a.as_i16x8(), N, val)) +pub unsafe fn i16x8_replace_lane(a: v128, val: i16) -> v128 { + transmute(simd_insert(a.as_i16x8(), N as u32, val)) } #[cfg(test)] @@ -753,8 +764,8 @@ unsafe fn i16x8_replace_lane_test(a: v128, val: i16) -> v128 { /// `N` from `a`. If `N` is out of bounds then it is a compile time error. #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn i32x4_extract_lane(a: v128) -> i32 { - simd_extract(a.as_i32x4(), N) +pub unsafe fn i32x4_extract_lane(a: v128) -> i32 { + simd_extract(a.as_i32x4(), N as u32) } #[cfg(test)] @@ -770,8 +781,8 @@ unsafe fn i32x4_extract_lane_test(a: v128) -> i32 { /// `N` from `a`. If `N` is out of bounds then it is a compile time error. #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn i32x4_replace_lane(a: v128, val: i32) -> v128 { - transmute(simd_insert(a.as_i32x4(), N, val)) +pub unsafe fn i32x4_replace_lane(a: v128, val: i32) -> v128 { + transmute(simd_insert(a.as_i32x4(), N as u32, val)) } #[cfg(test)] @@ -787,8 +798,8 @@ unsafe fn i32x4_replace_lane_test(a: v128, val: i32) -> v128 { /// `N` from `a`. If `N` is out of bounds then it is a compile time error. #[inline] #[target_feature(enable = "simd128")] -pub unsafe fn i64x2_extract_lane(a: v128) -> i64 { - simd_extract(a.as_i64x2(), N) +pub unsafe fn i64x2_extract_lane(a: v128) -> i64 { + simd_extract(a.as_i64x2(), N as u32) } #[cfg(test)] @@ -804,8 +815,8 @@ unsafe fn i64x2_extract_lane_test(a: v128) -> i64 { /// `N` from `a`. If `N` is out of bounds then it is a compile time error. 
 #[inline]
 #[target_feature(enable = "simd128")]
-pub unsafe fn i64x2_replace_lane<const N: u32>(a: v128, val: i64) -> v128 {
-    transmute(simd_insert(a.as_i64x2(), N, val))
+pub unsafe fn i64x2_replace_lane<const N: usize>(a: v128, val: i64) -> v128 {
+    transmute(simd_insert(a.as_i64x2(), N as u32, val))
 }
 
 #[cfg(test)]
@@ -821,8 +832,8 @@ unsafe fn i64x2_replace_lane_test(a: v128, val: i64) -> v128 {
 /// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub unsafe fn f32x4_extract_lane<const N: u32>(a: v128) -> f32 {
-    simd_extract(a.as_f32x4(), N)
+pub unsafe fn f32x4_extract_lane<const N: usize>(a: v128) -> f32 {
+    simd_extract(a.as_f32x4(), N as u32)
 }
 
 #[cfg(test)]
@@ -838,8 +849,8 @@ unsafe fn f32x4_extract_lane_test(a: v128) -> f32 {
 /// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub unsafe fn f32x4_replace_lane<const N: u32>(a: v128, val: f32) -> v128 {
-    transmute(simd_insert(a.as_f32x4(), N, val))
+pub unsafe fn f32x4_replace_lane<const N: usize>(a: v128, val: f32) -> v128 {
+    transmute(simd_insert(a.as_f32x4(), N as u32, val))
 }
 
 #[cfg(test)]
@@ -855,8 +866,8 @@ unsafe fn f32x4_replace_lane_test(a: v128, val: f32) -> v128 {
 /// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub unsafe fn f64x2_extract_lane<const N: u32>(a: v128) -> f64 {
-    simd_extract(a.as_f64x2(), N)
+pub unsafe fn f64x2_extract_lane<const N: usize>(a: v128) -> f64 {
+    simd_extract(a.as_f64x2(), N as u32)
 }
 
 #[cfg(test)]
@@ -872,8 +883,8 @@ unsafe fn f64x2_extract_lane_test(a: v128) -> f64 {
 /// `N` from `a`. If `N` is out of bounds then it is a compile time error.
 #[inline]
 #[target_feature(enable = "simd128")]
-pub unsafe fn f64x2_replace_lane<const N: u32>(a: v128, val: f64) -> v128 {
-    transmute(simd_insert(a.as_f64x2(), N, val))
+pub unsafe fn f64x2_replace_lane<const N: usize>(a: v128, val: f64) -> v128 {
+    transmute(simd_insert(a.as_f64x2(), N as u32, val))
 }
 
 #[cfg(test)]

From 2505f7c63aae0afa02e502cc00d19d207415a46e Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Fri, 17 Jul 2020 07:17:10 -0700
Subject: [PATCH 13/15] Pin wasmtime to 0.19.0

---
 ci/docker/wasm32-wasi/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/docker/wasm32-wasi/Dockerfile b/ci/docker/wasm32-wasi/Dockerfile
index 0e151ebbb2..9bb4dc6f1d 100644
--- a/ci/docker/wasm32-wasi/Dockerfile
+++ b/ci/docker/wasm32-wasi/Dockerfile
@@ -7,8 +7,8 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends \
   xz-utils \
   clang
 
-RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/dev/wasmtime-dev-x86_64-linux.tar.xz | tar xJf -
-ENV PATH=$PATH:/wasmtime-dev-x86_64-linux
+RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/v0.19.0/wasmtime-v0.19.0-x86_64-linux.tar.xz | tar xJf -
+ENV PATH=$PATH:/wasmtime-v0.19.0-x86_64-linux
 
 ENV CARGO_TARGET_WASM32_WASI_RUNNER="wasmtime \
   --enable-simd \

From ae88f820b6e155e2a2d92d431d14da4fd8275e5c Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Fri, 17 Jul 2020 07:21:20 -0700
Subject: [PATCH 14/15] Switch `*_const` to non-const-generics

---
 crates/core_arch/src/wasm32/simd128.rs | 160 ++++++++++++-------------
 1 file changed, 78 insertions(+), 82 deletions(-)

diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs
index 099d664854..4860bb49d1 100644
--- a/crates/core_arch/src/wasm32/simd128.rs
+++ b/crates/core_arch/src/wasm32/simd128.rs
@@ -334,60 +334,82 @@ unsafe fn test_v128_const() -> v128 {
 /// vector was
interpreted as sixteen 8-bit integers. #[inline] #[target_feature(enable = "simd128")] -pub const unsafe fn i8x16_const< - const A0: i8, - const A1: i8, - const A2: i8, - const A3: i8, - const A4: i8, - const A5: i8, - const A6: i8, - const A7: i8, - const A8: i8, - const A9: i8, - const A10: i8, - const A11: i8, - const A12: i8, - const A13: i8, - const A14: i8, - const A15: i8, ->() -> v128 { +#[cfg_attr( + all(test, all_simd), + assert_instr( + v128.const, + a0 = 0, + a1 = 1, + a2 = 2, + a3 = 3, + a4 = 4, + a5 = 5, + a6 = 6, + a7 = 7, + a8 = 8, + a9 = 9, + a10 = 10, + a11 = 11, + a12 = 12, + a13 = 13, + a14 = 14, + a15 = 15, + ) +)] +pub const unsafe fn i8x16_const( + a0: i8, + a1: i8, + a2: i8, + a3: i8, + a4: i8, + a5: i8, + a6: i8, + a7: i8, + a8: i8, + a9: i8, + a10: i8, + a11: i8, + a12: i8, + a13: i8, + a14: i8, + a15: i8, +) -> v128 { transmute(i8x16( - A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, + a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, )) } -#[cfg(all(test, all_simd))] -#[assert_instr(v128.const)] -#[target_feature(enable = "simd128")] -unsafe fn test_i8x16_const() -> v128 { - i8x16_const::<0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>() -} - /// Materializes a constant SIMD value from the immediate operands. /// /// This function generates a `v128.const` instruction as if the generated /// vector was interpreted as eight 16-bit integers. #[inline] #[target_feature(enable = "simd128")] -pub const unsafe fn i16x8_const< - const A0: i16, - const A1: i16, - const A2: i16, - const A3: i16, - const A4: i16, - const A5: i16, - const A6: i16, - const A7: i16, ->() -> v128 { - transmute(i16x8(A0, A1, A2, A3, A4, A5, A6, A7)) -} - -#[cfg(all(test, all_simd))] -#[assert_instr(v128.const)] -#[target_feature(enable = "simd128")] -unsafe fn test_i16x8_const() -> v128 { - i16x8_const::<0, 0, 0, 0, 1, 0, 0, 0>() +#[cfg_attr( + all(test, all_simd), + assert_instr( + v128.const, + a0 = 0, + a1 = 1, + a2 = 2, + a3 = 3, + a4 = 4, + a5 = 5, + a6 = 6, + a7 = 7, + ) +)] +pub const unsafe fn i16x8_const( + a0: i16, + a1: i16, + a2: i16, + a3: i16, + a4: i16, + a5: i16, + a6: i16, + a7: i16, +) -> v128 { + transmute(i16x8(a0, a1, a2, a3, a4, a5, a6, a7)) } /// Materializes a constant SIMD value from the immediate operands. @@ -396,16 +418,9 @@ unsafe fn test_i16x8_const() -> v128 { /// vector was interpreted as four 32-bit integers. #[inline] #[target_feature(enable = "simd128")] -pub const unsafe fn i32x4_const() -> v128 -{ - transmute(i32x4(A0, A1, A2, A3)) -} - -#[cfg(all(test, all_simd))] -#[assert_instr(v128.const)] -#[target_feature(enable = "simd128")] -unsafe fn test_i32x4_const() -> v128 { - i32x4_const::<0, 0, 0, 1>() +#[cfg_attr(all(test, all_simd), assert_instr(v128.const, a0 = 0, a1 = 1, a2 = 2, a3 = 3))] +pub const unsafe fn i32x4_const(a0: i32, a1: i32, a2: i32, a3: i32) -> v128 { + transmute(i32x4(a0, a1, a2, a3)) } /// Materializes a constant SIMD value from the immediate operands. @@ -414,15 +429,9 @@ unsafe fn test_i32x4_const() -> v128 { /// vector was interpreted as two 64-bit integers. 
#[inline] #[target_feature(enable = "simd128")] -pub const unsafe fn i64x2_const() -> v128 { - transmute(i64x2(A0, A1)) -} - -#[cfg(all(test, all_simd))] -#[assert_instr(v128.const)] -#[target_feature(enable = "simd128")] -unsafe fn test_i64x2_const() -> v128 { - i64x2_const::<0, 2>() +#[cfg_attr(all(test, all_simd), assert_instr(v128.const, a0 = 0, a1 = 1))] +pub const unsafe fn i64x2_const(a0: i64, a1: i64) -> v128 { + transmute(i64x2(a0, a1)) } /// Materializes a constant SIMD value from the immediate operands. @@ -431,16 +440,9 @@ unsafe fn test_i64x2_const() -> v128 { /// vector was interpreted as four 32-bit floats. #[inline] #[target_feature(enable = "simd128")] -pub const unsafe fn f32x4_const() -> v128 -{ - transmute(f32x4(A0, A1, A2, A3)) -} - -#[cfg(all(test, all_simd))] -#[assert_instr(v128.const)] -#[target_feature(enable = "simd128")] -unsafe fn test_f32x4_const() -> v128 { - f32x4_const::<0.0, 1.0, 0.0, 0.0>() +#[cfg_attr(all(test, all_simd), assert_instr(v128.const, a0 = 0.0, a1 = 1.0, a2 = 2.0, a3 = 3.0))] +pub const unsafe fn f32x4_const(a0: f32, a1: f32, a2: f32, a3: f32) -> v128 { + transmute(f32x4(a0, a1, a2, a3)) } /// Materializes a constant SIMD value from the immediate operands. @@ -449,15 +451,9 @@ unsafe fn test_f32x4_const() -> v128 { /// vector was interpreted as two 64-bit floats. #[inline] #[target_feature(enable = "simd128")] -pub const unsafe fn f64x2_const() -> v128 { - transmute(f64x2(A0, A1)) -} - -#[cfg(all(test, all_simd))] -#[assert_instr(v128.const)] -#[target_feature(enable = "simd128")] -unsafe fn test_f64x2_const() -> v128 { - f64x2_const::<0.0, 1.0>() +#[cfg_attr(all(test, all_simd), assert_instr(v128.const, a0 = 0.0, a1 = 1.0))] +pub const unsafe fn f64x2_const(a0: f64, a1: f64) -> v128 { + transmute(f64x2(a0, a1)) } /// Returns a new vector with lanes selected from the lanes of the two input From 6ada7ba77c6293ff575455687667eaf1f6226f8a Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Fri, 17 Jul 2020 07:22:05 -0700 Subject: [PATCH 15/15] Remove v128_const --- crates/core_arch/src/wasm32/simd128.rs | 38 +------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index 4860bb49d1..798035d76a 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -292,42 +292,6 @@ pub unsafe fn v128_store(m: *mut v128, a: v128) { *m = a; } -/// Materializes a constant SIMD value from the immediate operands. -/// -/// The `v128.const` instruction is encoded with 16 immediate bytes -/// which provide the bits of the vector directly. -#[inline] -#[target_feature(enable = "simd128")] -pub const unsafe fn v128_const< - const A0: u8, - const A1: u8, - const A2: u8, - const A3: u8, - const A4: u8, - const A5: u8, - const A6: u8, - const A7: u8, - const A8: u8, - const A9: u8, - const A10: u8, - const A11: u8, - const A12: u8, - const A13: u8, - const A14: u8, - const A15: u8, ->() -> v128 { - transmute(u8x16( - A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, - )) -} - -#[cfg(all(test, all_simd))] -#[assert_instr(v128.const)] -#[target_feature(enable = "simd128")] -unsafe fn test_v128_const() -> v128 { - v128_const::<0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>() -} - /// Materializes a constant SIMD value from the immediate operands. 
/// /// This function generates a `v128.const` instruction as if the generated @@ -2382,7 +2346,7 @@ pub mod tests { #[test] fn test_v128_const() { const A: v128 = - unsafe { super::v128_const::<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>() }; + unsafe { super::i8x16_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) }; compare_bytes(A, A); }
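
As a quick, end-to-end illustration of how the reworked API reads after the last two patches, here is a minimal usage sketch. It is not part of the series itself: it assumes a nightly toolchain targeting wasm32-wasi with the simd128 target feature enabled, the exact unstable feature gates (`stdsimd`/`wasm_simd`) may shift while the proposal is in flux, and `i32x4_add` is an existing intrinsic defined elsewhere in this module.

    #![feature(stdsimd, wasm_simd)]
    use core::arch::wasm32::*;

    #[target_feature(enable = "simd128")]
    unsafe fn demo() -> i32 {
        // After the `*_const` rework, constants are built from plain arguments.
        let a = i32x4_const(1, 2, 3, 4);
        let b = i32x4_const(10, 20, 30, 40);
        // Shuffle patterns and lane indices are const generics, so an
        // out-of-range index is rejected at compile time instead of being UB.
        let interleaved = v32x4_shuffle::<0, 4, 1, 5>(a, b); // [a0, b0, a1, b1]
        let sum = i32x4_add(a, b);
        i32x4_extract_lane::<0>(sum) + i32x4_extract_lane::<3>(interleaved)
    }

Because the shuffles are now ordinary const-generic functions rather than a crate-local `macro`, the cross-crate macro plumbing (and the xcrate-macros test deleted above) is no longer needed.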