From 0b5db6678f6e0d8d1035d235b26d19ffcbd331f5 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Thu, 4 Jun 2020 08:22:14 -0700
Subject: [PATCH] Update and revamp wasm32 SIMD intrinsics

Lots of time and lots of things have happened since the simd128 support
was first added to this crate. Things are starting to settle down now,
so this commit syncs the Rust intrinsic definitions with the current
specification (https://github.com/WebAssembly/simd). Unfortunately not
everything can be enabled just yet, but everything is in the pipeline
for getting enabled soon.

This commit also applies a major revamp to how intrinsics are tested.
The intention is that the setup should be much more lightweight and/or
easy to work with after this commit.

At a high level, the changes here are:

* Testing with node.js and `#[wasm_bindgen]` has been removed. Instead
  intrinsics are tested with Wasmtime, which has a nearly complete
  implementation of the SIMD spec (and soon a fully complete one!)

* Testing is switched to `wasm32-wasi` to make idiomatic Rust bits a
  bit easier to work with (e.g. `panic!`)

* Testing of this crate's simd128 feature for wasm is re-enabled. This
  will run on CI and both compile and execute intrinsics. This should
  bring wasm intrinsics to the same level of parity as x86 intrinsics,
  for example.

* New wasm intrinsics have been added:
  * `iNNxMM_loadAxA_{s,u}`
  * `vNNxMM_load_splat`
  * `v8x16_swizzle`
  * `v128_andnot`
  * `iNNxMM_abs`
  * `iNNxMM_narrow_*_{u,s}`
  * `iNNxMM_bitmask` - commented out until LLVM is updated to LLVM 11
  * `iNNxMM_widen_*_{u,s}` - commented out until
    bytecodealliance/wasmtime#1994 lands
  * `iNNxMM_{max,min}_{u,s}`
  * `iNNxMM_avgr_u`

* Some wasm intrinsics have been removed:
  * `i64x2_trunc_*`
  * `f64x2_convert_*`
  * `i8x16_mul`

* The `v8x16.shuffle` instruction is exposed. This is done through a
  `macro` (not `macro_rules!`, but `macro`). This is intended to be
  somewhat experimental and unstable until we decide otherwise. This
  instruction has 16 immediate-mode operands and is as a result
  unsuited to the existing `constify_*` logic of this crate. I'm hoping
  that we can game out over time what a macro might look like and/or
  look for better solutions. For now, though, what's implemented is the
  first of its kind in this crate (an architecture-specific macro), so
  some extra scrutiny looking at it would be appreciated. A usage
  sketch is included at the end of this message.

* Lots of `assert_instr` annotations have been fixed for wasm.

* All wasm simd128 tests are uncommented and passing now.

This is still missing tests for new intrinsics, and it's also missing
tests for various corner cases. I hope to get to those later as the
upstream spec itself gets closer to stabilization.

In the meantime, however, I went ahead and updated the `hex.rs` example
with a wasm implementation using intrinsics. With it I got some very
impressive speedups using Wasmtime:

test benches::large_default  ... bench:     213,961 ns/iter (+/- 5,108) = 4900 MB/s
test benches::large_fallback ... bench:   3,108,434 ns/iter (+/- 75,730) = 337 MB/s
test benches::small_default  ... bench:          52 ns/iter (+/- 0) = 2250 MB/s
test benches::small_fallback ... bench:         358 ns/iter (+/- 0) = 326 MB/s

In other words, with Wasmtime, SIMD hex encoding is 15x faster on 1MB
chunks and 7x faster on small <128-byte chunks.

All of these intrinsics are still unstable and will continue to be so,
presumably, until the simd proposal in wasm itself progresses to a
later stage. Additionally we'll still want to sync with clang on
intrinsic names (or decide not to) at some point in the future.
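As a usage sketch of the new shuffle macro (a hypothetical example for
illustration only; `interleave_low_bytes` is an invented name, not an
intrinsic in this patch): the macro takes two `v128` operands followed
by 16 constant lane indices, where indices 0-15 select bytes from the
first operand and indices 16-31 select bytes from the second.

    // Hypothetical illustration; requires a wasm32 target built with
    // `-C target-feature=+simd128` and the unstable wasm32 intrinsics.
    use core::arch::wasm32::{v128, v8x16_shuffle};

    // Interleaves the low 8 bytes of `a` with the low 8 bytes of `b`:
    // even output lanes come from `a` (indices 0-7), odd output lanes
    // come from `b` (indices 16-23).
    fn interleave_low_bytes(a: v128, b: v128) -> v128 {
        v8x16_shuffle!(
            a, b,
            0, 16, 1, 17, 2, 18, 3, 19,
            4, 20, 5, 21, 6, 22, 7, 23,
        )
    }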
--- .github/workflows/main.yml | 4 +- ci/docker/wasm32-unknown-unknown/Dockerfile | 25 - .../wasm32-unknown-unknown/wasm-entrypoint.sh | 15 - ci/docker/wasm32-wasi/Dockerfile | 22 + ci/run.sh | 31 +- crates/assert-instr-macro/src/lib.rs | 8 +- crates/core_arch/Cargo.toml | 3 - crates/core_arch/src/lib.rs | 10 +- crates/core_arch/src/mod.rs | 2 + crates/core_arch/src/wasm32/atomic.rs | 2 - crates/core_arch/src/wasm32/memory.rs | 2 - crates/core_arch/src/wasm32/mod.rs | 4 +- crates/core_arch/src/wasm32/simd128.rs | 1691 ++++++++++------- crates/core_arch/tests/xcrate-macros.rs | 18 + crates/std_detect/src/detect/mod.rs | 2 + crates/std_detect/src/detect/os/other.rs | 1 + crates/stdarch-test/Cargo.toml | 6 +- crates/stdarch-test/src/lib.rs | 12 - crates/stdarch-test/src/wasm.rs | 48 +- examples/Cargo.toml | 6 +- examples/hex.rs | 85 +- examples/wasm.rs | 4 +- 22 files changed, 1204 insertions(+), 797 deletions(-) delete mode 100644 ci/docker/wasm32-unknown-unknown/Dockerfile delete mode 100755 ci/docker/wasm32-unknown-unknown/wasm-entrypoint.sh create mode 100644 ci/docker/wasm32-wasi/Dockerfile create mode 100644 crates/core_arch/tests/xcrate-macros.rs diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c5ef15004c..cadfc38300 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -77,7 +77,7 @@ jobs: - mips64-unknown-linux-gnuabi64 - mips64el-unknown-linux-gnuabi64 - s390x-unknown-linux-gnu - - wasm32-unknown-unknown + - wasm32-wasi - i586-unknown-linux-gnu - x86_64-linux-android - arm-linux-androideabi @@ -129,7 +129,7 @@ jobs: disable_assert_instr: true - target: s390x-unknown-linux-gnu os: ubuntu-latest - - target: wasm32-unknown-unknown + - target: wasm32-wasi os: ubuntu-latest - target: aarch64-unknown-linux-gnu os: ubuntu-latest diff --git a/ci/docker/wasm32-unknown-unknown/Dockerfile b/ci/docker/wasm32-unknown-unknown/Dockerfile deleted file mode 100644 index 7b2567bcc7..0000000000 --- a/ci/docker/wasm32-unknown-unknown/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM ubuntu:18.04 - -RUN apt-get update -y && apt-get install -y --no-install-recommends \ - ca-certificates \ - clang \ - cmake \ - curl \ - git \ - libc6-dev \ - make \ - python \ - python3 \ - xz-utils - -# Install `wasm2wat` -RUN git clone --recursive https://github.com/WebAssembly/wabt -RUN make -C wabt -j$(nproc) -ENV PATH=$PATH:/wabt/bin - -# Install `node` -RUN curl https://nodejs.org/dist/v12.0.0/node-v12.0.0-linux-x64.tar.xz | tar xJf - -ENV PATH=$PATH:/node-v12.0.0-linux-x64/bin - -COPY docker/wasm32-unknown-unknown/wasm-entrypoint.sh /wasm-entrypoint.sh -ENTRYPOINT ["/wasm-entrypoint.sh"] diff --git a/ci/docker/wasm32-unknown-unknown/wasm-entrypoint.sh b/ci/docker/wasm32-unknown-unknown/wasm-entrypoint.sh deleted file mode 100755 index 9916d1cb22..0000000000 --- a/ci/docker/wasm32-unknown-unknown/wasm-entrypoint.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -set -e - -# Download an appropriate version of wasm-bindgen based off of what's being used -# in the lock file. Ideally we'd use `wasm-pack` at some point for this! 
-version=$(grep -A 1 'name = "wasm-bindgen"' Cargo.lock | grep version) -version=$(echo $version | awk '{print $3}' | sed 's/"//g') -curl -L https://github.com/rustwasm/wasm-bindgen/releases/download/$version/wasm-bindgen-$version-x86_64-unknown-linux-musl.tar.gz \ - | tar xzf - -C target -export PATH=$PATH:`pwd`/target/wasm-bindgen-$version-x86_64-unknown-linux-musl -export CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=wasm-bindgen-test-runner -export NODE_ARGS=--experimental-wasm-simd - -exec "$@" diff --git a/ci/docker/wasm32-wasi/Dockerfile b/ci/docker/wasm32-wasi/Dockerfile new file mode 100644 index 0000000000..7b7d75190c --- /dev/null +++ b/ci/docker/wasm32-wasi/Dockerfile @@ -0,0 +1,22 @@ +FROM ubuntu:20.04 + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update -y && apt-get install -y --no-install-recommends \ + ca-certificates \ + clang \ + cmake \ + curl \ + git \ + libc6-dev \ + make \ + python \ + python3 \ + xz-utils + +RUN curl -L https://github.com/bytecodealliance/wasmtime/releases/download/dev/wasmtime-dev-x86_64-linux.tar.xz | tar xJf - +ENV PATH=$PATH:/wasmtime-dev-x86_64-linux + +ENV CARGO_TARGET_WASM32_WASI_RUNNER="wasmtime \ + --enable-simd \ + --mapdir .::/checkout/target/wasm32-wasi/release/deps \ + --" diff --git a/ci/run.sh b/ci/run.sh index 682a38636c..2b7e51be3d 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -44,6 +44,16 @@ cargo_test() { fi cmd="$cmd ${subcmd} --target=$TARGET $1" cmd="$cmd -- $2" + + # wasm targets can't catch panics, so if a test fails make sure the test + # harness isn't trying to capture output, otherwise we won't get any useful + # output. + case ${TARGET} in + wasm32*) + cmd="$cmd --nocapture" + ;; + esac + $cmd } @@ -72,20 +82,11 @@ case ${TARGET} in export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx" cargo_test "--release" ;; - wasm32-unknown-unknown*) - # Attempt to actually run some SIMD tests in node.js. Unfortunately - # though node.js (transitively through v8) doesn't have support for the - # full SIMD spec yet, only some functions. As a result only pass in - # some target features and a special `--cfg` - # FIXME: broken - #export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+simd128 --cfg only_node_compatible_functions" - #cargo_test "--release" - - # After that passes make sure that all intrinsics compile, passing in - # the extra feature to compile in non-node-compatible SIMD. - # FIXME: broken - #export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+simd128,+unimplemented-simd128" - #cargo_test "--release --no-run" + wasm32*) + prev="$RUSTFLAGS" + export RUSTFLAGS="${RUSTFLAGS} -C target-feature=+simd128,+unimplemented-simd128" + cargo_test "--release" + export RUSTFLAGS="$prev" ;; # FIXME: don't build anymore #mips-*gnu* | mipsel-*gnu*) @@ -111,7 +112,7 @@ case ${TARGET} in esac -if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ] && [ "$TARGET" != "wasm32-unknown-unknown" ]; then +if [ "$NORUN" != "1" ] && [ "$NOSTD" != 1 ]; then # Test examples ( cd examples diff --git a/crates/assert-instr-macro/src/lib.rs b/crates/assert-instr-macro/src/lib.rs index 75fe9851ca..200f02fae5 100644 --- a/crates/assert-instr-macro/src/lib.rs +++ b/crates/assert-instr-macro/src/lib.rs @@ -131,8 +131,7 @@ pub fn assert_instr( }; let tokens: TokenStream = quote! { - #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] - #[cfg_attr(not(target_arch = "wasm32"), test)] + #[test] #[allow(non_snake_case)] fn #assert_name() { #to_test @@ -146,11 +145,6 @@ pub fn assert_instr( #instr); } }; - // why? necessary now to get tests to work?
- let tokens: TokenStream = tokens - .to_string() - .parse() - .expect("cannot parse tokenstream"); let tokens: TokenStream = quote! { #item diff --git a/crates/core_arch/Cargo.toml b/crates/core_arch/Cargo.toml index 72d89b0168..a25b20bf0c 100644 --- a/crates/core_arch/Cargo.toml +++ b/crates/core_arch/Cargo.toml @@ -26,8 +26,5 @@ maintenance = { status = "experimental" } stdarch-test = { version = "0.*", path = "../stdarch-test" } std_detect = { version = "0.*", path = "../std_detect" } -[target.wasm32-unknown-unknown.dev-dependencies] -wasm-bindgen-test = "0.2.47" - [package.metadata.docs.rs] rustdoc-args = [ "--cfg", "dox" ] diff --git a/crates/core_arch/src/lib.rs b/crates/core_arch/src/lib.rs index fae4519a0e..cf59f9680c 100644 --- a/crates/core_arch/src/lib.rs +++ b/crates/core_arch/src/lib.rs @@ -32,7 +32,10 @@ adx_target_feature, rtm_target_feature, f16c_target_feature, - external_doc + external_doc, + allow_internal_unstable, + allow_internal_unsafe, + decl_macro )] #![cfg_attr(test, feature(test, abi_vectorcall, untagged_unions))] #![deny(clippy::missing_inline_in_public_items)] @@ -66,13 +69,10 @@ extern crate std_detect; #[cfg(test)] extern crate stdarch_test; -#[cfg(all(test, target_arch = "wasm32"))] -extern crate wasm_bindgen_test; - #[path = "mod.rs"] mod core_arch; -pub use self::core_arch::arch::*; +pub use self::core_arch::arch; #[allow(unused_imports)] use core::{ffi, hint, intrinsics, marker, mem, ops, ptr, sync}; diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs index 4ed18d7648..19f61affdd 100644 --- a/crates/core_arch/src/mod.rs +++ b/crates/core_arch/src/mod.rs @@ -1,5 +1,7 @@ //! `core_arch` +#![allow(improper_ctypes_definitions)] + #[macro_use] mod macros; diff --git a/crates/core_arch/src/wasm32/atomic.rs b/crates/core_arch/src/wasm32/atomic.rs index b8ffaeac0e..024bf94a7f 100644 --- a/crates/core_arch/src/wasm32/atomic.rs +++ b/crates/core_arch/src/wasm32/atomic.rs @@ -10,8 +10,6 @@ #[cfg(test)] use stdarch_test::assert_instr; -#[cfg(test)] -use wasm_bindgen_test::wasm_bindgen_test; extern "C" { #[link_name = "llvm.wasm.atomic.wait.i32"] diff --git a/crates/core_arch/src/wasm32/memory.rs b/crates/core_arch/src/wasm32/memory.rs index 3df8abdee2..c4e801b738 100644 --- a/crates/core_arch/src/wasm32/memory.rs +++ b/crates/core_arch/src/wasm32/memory.rs @@ -1,7 +1,5 @@ #[cfg(test)] use stdarch_test::assert_instr; -#[cfg(test)] -use wasm_bindgen_test::wasm_bindgen_test; extern "C" { #[link_name = "llvm.wasm.memory.grow.i32"] diff --git a/crates/core_arch/src/wasm32/mod.rs b/crates/core_arch/src/wasm32/mod.rs index 5e7a9d85f4..1bbce97619 100644 --- a/crates/core_arch/src/wasm32/mod.rs +++ b/crates/core_arch/src/wasm32/mod.rs @@ -2,8 +2,6 @@ #[cfg(test)] use stdarch_test::assert_instr; -#[cfg(test)] -use wasm_bindgen_test::wasm_bindgen_test; #[cfg(any(target_feature = "atomics", dox))] mod atomic; @@ -12,6 +10,8 @@ pub use self::atomic::*; #[cfg(any(target_feature = "simd128", dox))] mod simd128; +// #[cfg(any(target_feature = "simd128", dox))] +// pub use self::simd128::v128_shuffle; #[cfg(any(target_feature = "simd128", dox))] pub use self::simd128::*; diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs index 5ac01a4fae..0b897d9bc2 100644 --- a/crates/core_arch/src/wasm32/simd128.rs +++ b/crates/core_arch/src/wasm32/simd128.rs @@ -4,6 +4,7 @@ //! 
https://github.com/WebAssembly/simd/blob/master/proposals/simd/SIMD.md #![allow(non_camel_case_types)] +#![allow(unused_imports)] use crate::{ core_arch::{simd::*, simd_llvm::*}, @@ -14,8 +15,6 @@ use crate::{ #[cfg(test)] use stdarch_test::assert_instr; -#[cfg(test)] -use wasm_bindgen_test::wasm_bindgen_test; types! { /// WASM-specific 128-bit wide SIMD vector type. @@ -119,11 +118,6 @@ extern "C" { #[link_name = "llvm.wasm.alltrue.v4i32"] fn llvm_i32x4_all_true(x: i32x4) -> i32; - #[link_name = "llvm.wasm.anytrue.v2i64"] - fn llvm_i64x2_any_true(x: i64x2) -> i32; - #[link_name = "llvm.wasm.alltrue.v2i64"] - fn llvm_i64x2_all_true(x: i64x2) -> i32; - #[link_name = "llvm.fabs.v4f32"] fn llvm_f32x4_abs(x: f32x4) -> f32x4; #[link_name = "llvm.sqrt.v4f32"] @@ -143,6 +137,47 @@ extern "C" { #[link_name = "llvm.wasm.bitselect.v16i8"] fn llvm_bitselect(a: i8x16, b: i8x16, c: i8x16) -> i8x16; + #[link_name = "llvm.wasm.swizzle"] + fn llvm_swizzle(a: i8x16, b: i8x16) -> i8x16; + + #[link_name = "llvm.wasm.bitmask.v16i8"] + fn llvm_bitmask_i8x16(a: i8x16) -> i32; + #[link_name = "llvm.wasm.narrow.signed.v16i8.v8i16"] + fn llvm_narrow_i8x16_s(a: i16x8, b: i16x8) -> i8x16; + #[link_name = "llvm.wasm.narrow.unsigned.v16i8.v8i16"] + fn llvm_narrow_i8x16_u(a: i16x8, b: i16x8) -> i8x16; + #[link_name = "llvm.wasm.avgr.unsigned.v16i8"] + fn llvm_avgr_u_i8x16(a: i8x16, b: i8x16) -> i8x16; + + #[link_name = "llvm.wasm.bitmask.v8i16"] + fn llvm_bitmask_i16x8(a: i16x8) -> i32; + #[link_name = "llvm.wasm.narrow.signed.v8i16.v8i16"] + fn llvm_narrow_i16x8_s(a: i32x4, b: i32x4) -> i16x8; + #[link_name = "llvm.wasm.narrow.unsigned.v8i16.v8i16"] + fn llvm_narrow_i16x8_u(a: i32x4, b: i32x4) -> i16x8; + #[link_name = "llvm.wasm.avgr.unsigned.v8i16"] + fn llvm_avgr_u_i16x8(a: i16x8, b: i16x8) -> i16x8; + #[link_name = "llvm.wasm.widen.low.signed.v8i16.v16i8"] + fn llvm_widen_low_i16x8_s(a: i8x16) -> i16x8; + #[link_name = "llvm.wasm.widen.high.signed.v8i16.v16i8"] + fn llvm_widen_high_i16x8_s(a: i8x16) -> i16x8; + #[link_name = "llvm.wasm.widen.low.unsigned.v8i16.v16i8"] + fn llvm_widen_low_i16x8_u(a: i8x16) -> i16x8; + #[link_name = "llvm.wasm.widen.high.unsigned.v8i16.v16i8"] + fn llvm_widen_high_i16x8_u(a: i8x16) -> i16x8; + + #[link_name = "llvm.wasm.bitmask.v4i32"] + fn llvm_bitmask_i32x4(a: i32x4) -> i32; + #[link_name = "llvm.wasm.avgr.unsigned.v4i32"] + fn llvm_avgr_u_i32x4(a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.wasm.widen.low.signed.v4i32.v8i16"] + fn llvm_widen_low_i32x4_s(a: i16x8) -> i32x4; + #[link_name = "llvm.wasm.widen.high.signed.v4i32.v8i16"] + fn llvm_widen_high_i32x4_s(a: i16x8) -> i32x4; + #[link_name = "llvm.wasm.widen.low.unsigned.v4i32.v8i16"] + fn llvm_widen_low_i32x4_u(a: i16x8) -> i32x4; + #[link_name = "llvm.wasm.widen.high.unsigned.v4i32.v8i16"] + fn llvm_widen_high_i32x4_u(a: i16x8) -> i32x4; } /// Loads a `v128` vector from the given heap address. 
@@ -152,6 +187,80 @@ pub unsafe fn v128_load(m: *const v128) -> v128 { ptr::read(m) } +/// Load eight 8-bit integers and sign extend each one to a 16-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i16x8.load8x8_s))] +pub unsafe fn i16x8_load8x8_s(m: *const i8) -> v128 { + transmute(simd_cast::<_, i16x8>(ptr::read(m as *const i8x8))) +} + +/// Load eight 8-bit integers and zero extend each one to a 16-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i16x8.load8x8_u))] +pub unsafe fn i16x8_load8x8_u(m: *const u8) -> v128 { + transmute(simd_cast::<_, u16x8>(ptr::read(m as *const u8x8))) +} + +/// Load four 16-bit integers and sign extend each one to a 32-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i32x4.load16x4_s))] +pub unsafe fn i32x4_load16x4_s(m: *const i16) -> v128 { + transmute(simd_cast::<_, i32x4>(ptr::read(m as *const i16x4))) +} + +/// Load four 16-bit integers and zero extend each one to a 32-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i32x4.load16x4_u))] +pub unsafe fn i32x4_load16x4_u(m: *const u16) -> v128 { + transmute(simd_cast::<_, u32x4>(ptr::read(m as *const u16x4))) +} + +/// Load two 32-bit integers and sign extend each one to a 64-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i64x2.load32x2_s))] +pub unsafe fn i64x2_load32x2_s(m: *const i32) -> v128 { + transmute(simd_cast::<_, i64x2>(ptr::read(m as *const i32x2))) +} + +/// Load two 32-bit integers and zero extend each one to a 64-bit lane +#[inline] +#[cfg_attr(test, assert_instr(i64x2.load32x2_u))] +pub unsafe fn i64x2_load32x2_u(m: *const u32) -> v128 { + transmute(simd_cast::<_, u64x2>(ptr::read(m as *const u32x2))) +} + +/// Load a single element and splat to all lanes of a v128 vector. +#[inline] +#[cfg_attr(test, assert_instr(v8x16.load_splat))] +pub unsafe fn v8x16_load_splat(m: *const u8) -> v128 { + let v = *m; + transmute(u8x16(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)) +} + +/// Load a single element and splat to all lanes of a v128 vector. +#[inline] +#[cfg_attr(test, assert_instr(v16x8.load_splat))] +pub unsafe fn v16x8_load_splat(m: *const u16) -> v128 { + let v = *m; + transmute(u16x8(v, v, v, v, v, v, v, v)) +} + +/// Load a single element and splat to all lanes of a v128 vector. +#[inline] +#[cfg_attr(test, assert_instr(v32x4.load_splat))] +pub unsafe fn v32x4_load_splat(m: *const u32) -> v128 { + let v = *m; + transmute(u32x4(v, v, v, v)) +} + +/// Load a single element and splat to all lanes of a v128 vector. +#[inline] +#[cfg_attr(test, assert_instr(v64x2.load_splat))] +pub unsafe fn v64x2_load_splat(m: *const u64) -> v128 { + let v = *m; + transmute(u64x2(v, v)) +} + /// Stores a `v128` vector to the given heap address. #[inline] #[cfg_attr(test, assert_instr(v128.store))] @@ -164,7 +273,6 @@ pub unsafe fn v128_store(m: *mut v128, a: v128) { /// The `v128.const` instruction is encoded with 16 immediate bytes /// `imm` which provide the bits of the vector directly. #[inline] -#[cfg(not(only_node_compatible_functions))] #[rustc_args_required_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)] #[cfg_attr(test, assert_instr( v128.const, @@ -217,6 +325,104 @@ pub const fn v128_const( } } +/// Returns a new vector with lanes selected from the lanes of the two input +/// vectors `$a` and `$b` specified in the 16 immediate operands. +/// +/// The `$a` and `$b` expressions must have type `v128`, and this macro +/// generates a wasm instruction that is encoded with 16 bytes providing the +/// indices of the elements to return. 
The indices `i` in range [0, 15] select +/// the `i`-th element of `a`. The indices in range [16, 31] select the `i - +/// 16`-th element of `b`. +/// +/// Note that this is a macro because codegen requires that all of the +/// index expressions `$i*` be constant. A compiler error will be +/// generated if any of the expressions are not constant. +/// +/// All indices `$i*` must have the type `u32`. +#[allow_internal_unstable(platform_intrinsics, rustc_attrs, repr_simd)] +#[allow_internal_unsafe] +pub macro v8x16_shuffle( + $a:expr, $b:expr, + $i0:expr, + $i1:expr, + $i2:expr, + $i3:expr, + $i4:expr, + $i5:expr, + $i6:expr, + $i7:expr, + $i8:expr, + $i9:expr, + $i10:expr, + $i11:expr, + $i12:expr, + $i13:expr, + $i14:expr, + $i15:expr $(,)? +) {{ + extern "platform-intrinsic" { + #[rustc_args_required_const(2)] + pub fn simd_shuffle16<T, U>(x: T, y: T, idx: [u32; 16]) -> U; + } + let shuf = simd_shuffle16::< + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + >( + $crate::arch::wasm32::__v8x16_shuffle_transmute::< + $crate::arch::wasm32::v128, + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + >($a), + $crate::arch::wasm32::__v8x16_shuffle_transmute::< + $crate::arch::wasm32::v128, + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + >($b), + [ + $i0, $i1, $i2, $i3, $i4, $i5, $i6, $i7, $i8, $i9, $i10, $i11, $i12, $i13, $i14, $i15, + ], + ); + $crate::arch::wasm32::__v8x16_shuffle_transmute::< + $crate::arch::wasm32::__v8x16_shuffle_u8x16, + $crate::arch::wasm32::v128, + >(shuf) +}} + +// internal implementation detail of the `v8x16_shuffle` macro, done so +// there's a name that always works for the macro to import. +pub use crate::mem::transmute as __v8x16_shuffle_transmute; + +// internal to this module and only generated here as an implementation detail +// of the `v8x16_shuffle` macro. +#[repr(simd)] +pub struct __v8x16_shuffle_u8x16( + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, + u8, +); + +/// Returns a new vector with lanes selected from the lanes of the first input +/// vector `a` specified in the second input vector `s`. +/// +/// The indices `i` in range [0, 15] select the `i`-th element of `a`. For +/// indices outside of the range the resulting lane is 0. +#[inline] +#[cfg_attr(test, assert_instr(v8x16.swizzle))] +pub fn v8x16_swizzle(a: v128, s: v128) -> v128 { + unsafe { transmute(llvm_swizzle(transmute(a), transmute(s))) } +} + /// Creates a vector with identical lanes. /// /// Constructs a vector with `x` replicated to all 16 lanes. @@ -226,6 +432,51 @@ pub fn i8x16_splat(a: i8) -> v128 { unsafe { transmute(i8x16::splat(a)) } } +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 8 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.splat))] +pub fn i16x8_splat(a: i16) -> v128 { + unsafe { transmute(i16x8::splat(a)) } +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 4 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.splat))] +pub fn i32x4_splat(a: i32) -> v128 { + unsafe { transmute(i32x4::splat(a)) } +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 2 lanes. +#[inline] +#[cfg_attr(test, assert_instr(i64x2.splat))] +pub fn i64x2_splat(a: i64) -> v128 { + unsafe { transmute(i64x2::splat(a)) } +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 4 lanes.
+#[inline] +#[cfg_attr(test, assert_instr(f32x4.splat))] +pub fn f32x4_splat(a: f32) -> v128 { + unsafe { transmute(f32x4::splat(a)) } +} + +/// Creates a vector with identical lanes. +/// +/// Constructs a vector with `x` replicated to all 2 lanes. +#[inline] +#[cfg_attr(test, assert_instr(f64x2.splat))] +pub fn f64x2_splat(a: f64) -> v128 { + unsafe { transmute(f64x2::splat(a)) } +} + /// Extracts a lane from a 128-bit vector interpreted as 16 packed i8 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -238,20 +489,21 @@ pub fn i8x16_splat(a: i8) -> v128 { #[inline] #[rustc_args_required_const(1)] pub unsafe fn i8x16_extract_lane(a: v128, imm: usize) -> i8 { - #[cfg(test)] - #[assert_instr(i8x16.extract_lane_s)] - fn extract_lane_s(a: v128) -> i32 { - unsafe { i8x16_extract_lane(a, 0) as i32 } - } - #[cfg(test)] - #[cfg(not(only_node_compatible_functions))] - #[assert_instr(i8x16.extract_lane_u)] - fn extract_lane_u(a: v128) -> u32 { - unsafe { i8x16_extract_lane(a, 0) as u32 } - } simd_extract(a.as_i8x16(), imm as u32) } +#[cfg(test)] +#[assert_instr(i8x16.extract_lane_s)] +fn i8x16_extract_lane_s(a: v128) -> i32 { + unsafe { i8x16_extract_lane(a, 0) as i32 } +} + +#[cfg(test)] +#[assert_instr(i8x16.extract_lane_u)] +fn i8x16_extract_lane_u(a: v128) -> u32 { + unsafe { i8x16_extract_lane(a, 0) as u8 as u32 } +} + /// Replaces a lane from a 128-bit vector interpreted as 16 packed i8 numbers. /// /// Replaces the scalar value of lane specified in the immediate mode operand @@ -268,15 +520,6 @@ pub unsafe fn i8x16_replace_lane(a: v128, imm: usize, val: i8) -> v128 { transmute(simd_insert(a.as_i8x16(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Construct a vector with `x` replicated to all 8 lanes. -#[inline] -#[cfg_attr(test, assert_instr(i16x8.splat))] -pub fn i16x8_splat(a: i16) -> v128 { - unsafe { transmute(i16x8::splat(a)) } -} - /// Extracts a lane from a 128-bit vector interpreted as 8 packed i16 numbers. /// /// Extracts a the scalar value of lane specified in the immediate mode operand @@ -289,20 +532,21 @@ pub fn i16x8_splat(a: i16) -> v128 { #[inline] #[rustc_args_required_const(1)] pub unsafe fn i16x8_extract_lane(a: v128, imm: usize) -> i16 { - #[cfg(test)] - #[assert_instr(i16x8.extract_lane_s)] - fn extract_lane_s(a: v128) -> i32 { - unsafe { i16x8_extract_lane(a, 0) as i32 } - } - #[cfg(test)] - #[cfg(not(only_node_compatible_functions))] - #[assert_instr(i16x8.extract_lane_u)] - fn extract_lane_u(a: v128) -> u32 { - unsafe { i16x8_extract_lane(a, 0) as u32 } - } simd_extract(a.as_i16x8(), imm as u32) } +#[cfg(test)] +#[assert_instr(i16x8.extract_lane_s)] +fn i16x8_extract_lane_s(a: v128) -> i32 { + unsafe { i16x8_extract_lane(a, 0) as i32 } +} + +#[cfg(test)] +#[assert_instr(i16x8.extract_lane_u)] +fn i16x8_extract_lane_u(a: v128) -> u32 { + unsafe { i16x8_extract_lane(a, 0) as u16 as u32 } +} + /// Replaces a lane from a 128-bit vector interpreted as 8 packed i16 numbers. /// /// Replaces the scalar value of lane specified in the immediate mode operand @@ -319,15 +563,6 @@ pub unsafe fn i16x8_replace_lane(a: v128, imm: usize, val: i16) -> v128 { transmute(simd_insert(a.as_i16x8(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 4 lanes. 
-#[inline] -#[cfg_attr(test, assert_instr(i32x4.splat))] -pub fn i32x4_splat(a: i32) -> v128 { - unsafe { transmute(i32x4::splat(a)) } -} - /// Extracts a lane from a 128-bit vector interpreted as 4 packed i32 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -360,16 +595,6 @@ pub unsafe fn i32x4_replace_lane(a: v128, imm: usize, val: i32) -> v128 { transmute(simd_insert(a.as_i32x4(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Construct a vector with `x` replicated to all 2 lanes. -#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i8x16.splat))] -pub fn i64x2_splat(a: i64) -> v128 { - unsafe { transmute(i64x2::splat(a)) } -} - /// Extracts a lane from a 128-bit vector interpreted as 2 packed i64 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -380,8 +605,7 @@ pub fn i64x2_splat(a: i64) -> v128 { /// This function has undefined behavior if `imm` is greater than or equal to /// 2. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.extract_lane_s, imm = 0))] +#[cfg_attr(test, assert_instr(i64x2.extract_lane, imm = 0))] #[rustc_args_required_const(1)] pub unsafe fn i64x2_extract_lane(a: v128, imm: usize) -> i64 { simd_extract(a.as_i64x2(), imm as u32) @@ -397,22 +621,12 @@ pub unsafe fn i64x2_extract_lane(a: v128, imm: usize) -> i64 { /// This function has undefined behavior if `imm` is greater than or equal to /// 2. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i64x2.replace_lane, imm = 0))] #[rustc_args_required_const(1)] pub unsafe fn i64x2_replace_lane(a: v128, imm: usize, val: i64) -> v128 { transmute(simd_insert(a.as_i64x2(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 4 lanes. -#[inline] -#[cfg_attr(test, assert_instr(f32x4.splat))] -pub fn f32x4_splat(a: f32) -> v128 { - unsafe { transmute(f32x4::splat(a)) } -} - /// Extracts a lane from a 128-bit vector interpreted as 4 packed f32 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -445,16 +659,6 @@ pub unsafe fn f32x4_replace_lane(a: v128, imm: usize, val: f32) -> v128 { transmute(simd_insert(a.as_f32x4(), imm as u32, val)) } -/// Creates a vector with identical lanes. -/// -/// Constructs a vector with `x` replicated to all 2 lanes. -#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(f64x2.splat))] -pub fn f64x2_splat(a: f64) -> v128 { - unsafe { transmute(f64x2::splat(a)) } -} - /// Extracts lane from a 128-bit vector interpreted as 2 packed f64 numbers. /// /// Extracts the scalar value of lane specified in the immediate mode operand @@ -465,8 +669,7 @@ pub fn f64x2_splat(a: f64) -> v128 { /// This function has undefined behavior if `imm` is greater than or equal to /// 2. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(f64x2.extract_lane_s, imm = 0))] +#[cfg_attr(test, assert_instr(f64x2.extract_lane, imm = 0))] #[rustc_args_required_const(1)] pub unsafe fn f64x2_extract_lane(a: v128, imm: usize) -> f64 { simd_extract(a.as_f64x2(), imm as u32) @@ -482,7 +685,6 @@ pub unsafe fn f64x2_extract_lane(a: v128, imm: usize) -> f64 { /// This function has undefined behavior if `imm` is greater than or equal to /// 2. 
#[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.replace_lane, imm = 0))] #[rustc_args_required_const(1)] pub unsafe fn f64x2_replace_lane(a: v128, imm: usize, val: f64) -> v128 { @@ -891,7 +1093,6 @@ pub fn f32x4_ge(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise elements /// were equal, or all zeros if the elements were not equal. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.eq))] pub fn f64x2_eq(a: v128, b: v128) -> v128 { unsafe { transmute(simd_eq::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -903,7 +1104,6 @@ pub fn f64x2_eq(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise elements /// were not equal, or all zeros if the elements were equal. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.ne))] pub fn f64x2_ne(a: v128, b: v128) -> v128 { unsafe { transmute(simd_ne::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -915,7 +1115,6 @@ pub fn f64x2_ne(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise left /// element is less than the pairwise right element, or all zeros otherwise. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.lt))] pub fn f64x2_lt(a: v128, b: v128) -> v128 { unsafe { transmute(simd_lt::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -927,7 +1126,6 @@ pub fn f64x2_lt(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise left /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.gt))] pub fn f64x2_gt(a: v128, b: v128) -> v128 { unsafe { transmute(simd_gt::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -939,7 +1137,6 @@ pub fn f64x2_gt(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise left /// element is less than the pairwise right element, or all zeros otherwise. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.le))] pub fn f64x2_le(a: v128, b: v128) -> v128 { unsafe { transmute(simd_le::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -951,7 +1148,6 @@ pub fn f64x2_le(a: v128, b: v128) -> v128 { /// Returns a new vector where each lane is all ones if the pairwise left /// element is greater than the pairwise right element, or all zeros otherwise. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.ge))] pub fn f64x2_ge(a: v128, b: v128) -> v128 { unsafe { transmute(simd_ge::<_, i64x2>(a.as_f64x2(), b.as_f64x2())) } @@ -972,6 +1168,20 @@ pub fn v128_and(a: v128, b: v128) -> v128 { unsafe { transmute(simd_and(a.as_i64x2(), b.as_i64x2())) } } +/// Bitwise AND of bits of `a` and the logical inverse of bits of `b`. +/// +/// This operation is equivalent to `v128.and(a, v128.not(b))` +#[inline] +#[cfg_attr(test, assert_instr(v128.andnot))] +pub fn v128_andnot(a: v128, b: v128) -> v128 { + unsafe { + transmute(simd_and( + a.as_i64x2(), + simd_xor(b.as_i64x2(), i64x2(-1, -1)), + )) + } +} + /// Performs a bitwise or of the two input 128-bit vectors, returning the /// resulting vector. 
#[inline] @@ -992,7 +1202,22 @@ pub fn v128_xor(a: v128, b: v128) -> v128 { #[inline] #[cfg_attr(test, assert_instr(v128.bitselect))] pub fn v128_bitselect(v1: v128, v2: v128, c: v128) -> v128 { - unsafe { transmute(llvm_bitselect(c.as_i8x16(), v1.as_i8x16(), v2.as_i8x16())) } + unsafe { transmute(llvm_bitselect(v1.as_i8x16(), v2.as_i8x16(), c.as_i8x16())) } +} + +/// Lane-wise wrapping absolute value. +#[inline] +// #[cfg_attr(test, assert_instr(i8x16.abs))] // FIXME support not in our LLVM yet +pub fn i8x16_abs(a: v128) -> v128 { + unsafe { + let a = transmute::<_, i8x16>(a); + let zero = i8x16::splat(0); + transmute(simd_select::<i8x16, _>( + simd_lt(a, zero), + simd_sub(zero, a), + a, + )) + } } /// Negates a 128-bit vectors intepreted as sixteen 8-bit signed integers @@ -1016,12 +1241,42 @@ pub fn i8x16_all_true(a: v128) -> i32 { unsafe { llvm_i8x16_all_true(a.as_i8x16()) } } +// FIXME: not available in our LLVM yet +// /// Extracts the high bit for each lane in `a` and produce a scalar mask with +// /// all bits concatenated. +// #[inline] +// #[cfg_attr(test, assert_instr(i8x16.all_true))] +// pub fn i8x16_bitmask(a: v128) -> i32 { +// unsafe { llvm_bitmask_i8x16(transmute(a)) } +// } + +/// Converts two input vectors into a smaller lane vector by narrowing each +/// lane. +/// +/// Signed saturation to 0x7f or 0x80 is used and the input lanes are always +/// interpreted as signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_s))] +pub fn i8x16_narrow_i16x8_s(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_narrow_i8x16_s(transmute(a), transmute(b))) } +} + +/// Converts two input vectors into a smaller lane vector by narrowing each +/// lane. +/// +/// Signed saturation to 0x00 or 0xff is used and the input lanes are always +/// interpreted as signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.narrow_i16x8_u))] +pub fn i8x16_narrow_i16x8_u(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_narrow_i8x16_u(transmute(a), transmute(b))) } +} + /// Shifts each lane to the left by the specified number of bits. /// /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i8x16.shl))] pub fn i8x16_shl(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shl(a.as_i8x16(), i8x16::splat(amt as i8))) } } @@ -1033,8 +1288,7 @@ pub fn i8x16_shl(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i8x16.shl))] +#[cfg_attr(test, assert_instr(i8x16.shr_s))] pub fn i8x16_shr_s(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_i8x16(), i8x16::splat(amt as i8))) } } @@ -1045,8 +1299,7 @@ pub fn i8x16_shr_s(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i8x16.shl))] +#[cfg_attr(test, assert_instr(i8x16.shr_u))] pub fn i8x16_shr_u(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_u8x16(), u8x16::splat(amt as u8))) } } @@ -1097,12 +1350,74 @@ pub fn i8x16_sub_saturate_u(a: v128, b: v128) -> v128 { unsafe { transmute(llvm_i8x16_sub_saturate_u(a.as_i8x16(), b.as_i8x16())) } } -/// Multiplies two 128-bit vectors as if they were two packed sixteen 8-bit -/// signed integers.
+/// Compares lane-wise signed integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.min_s))] +pub fn i8x16_min_s(a: v128, b: v128) -> v128 { + unsafe { + let a = a.as_i8x16(); + let b = b.as_i8x16(); + transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.min_u))] +pub fn i8x16_min_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u8x16>(a); + let b = transmute::<_, u8x16>(b); + transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b)) + } +} + +/// Compares lane-wise signed integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.max_s))] +pub fn i8x16_max_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i8x16>(a); + let b = transmute::<_, i8x16>(b); + transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i8x16.max_u))] +pub fn i8x16_max_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u8x16>(a); + let b = transmute::<_, u8x16>(b); + transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b)) + } +} + +/// Lane-wise rounding average. #[inline] -#[cfg_attr(test, assert_instr(i8x16.mul))] -pub fn i8x16_mul(a: v128, b: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i8x16(), b.as_i8x16())) +#[cfg_attr(test, assert_instr(i8x16.avgr_u))] +pub fn i8x16_avgr_u(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_avgr_u_i8x16(transmute(a), transmute(b))) } +} + +/// Lane-wise wrapping absolute value. +#[inline] +// #[cfg_attr(test, assert_instr(i16x8.abs))] // FIXME support not in our LLVM yet +pub fn i16x8_abs(a: v128) -> v128 { + unsafe { + let a = transmute::<_, i16x8>(a); + let zero = i16x8::splat(0); + transmute(simd_select::<i16x8, _>( + simd_lt(a, zero), + simd_sub(zero, a), + a, + )) + } } /// Negates a 128-bit vectors intepreted as eight 16-bit signed integers @@ -1126,12 +1441,75 @@ pub fn i16x8_all_true(a: v128) -> i32 { unsafe { llvm_i16x8_all_true(a.as_i16x8()) } } +// FIXME: not available in our LLVM yet +// /// Extracts the high bit for each lane in `a` and produce a scalar mask with +// /// all bits concatenated. +// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.all_true))] +// pub fn i16x8_bitmask(a: v128) -> i32 { +// unsafe { llvm_bitmask_i16x8(transmute(a)) } +// } + +/// Converts two input vectors into a smaller lane vector by narrowing each +/// lane. +/// +/// Signed saturation to 0x7fff or 0x8000 is used and the input lanes are always +/// interpreted as signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_s))] +pub fn i16x8_narrow_i32x4_s(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_narrow_i16x8_s(transmute(a), transmute(b))) } +} + +/// Converts two input vectors into a smaller lane vector by narrowing each +/// lane. +/// +/// Signed saturation to 0x0000 or 0xffff is used and the input lanes are always +/// interpreted as signed integers. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.narrow_i32x4_u))] +pub fn i16x8_narrow_i32x4_u(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_narrow_i16x8_u(transmute(a), transmute(b))) } +} + +// FIXME waiting on a runtime implementation to test +// /// Converts low half of the smaller lane vector to a larger lane +// /// vector, sign extended.
+// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_s))] +// pub fn i16x8_widen_low_i8x16_s(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_low_i16x8_s(transmute(a))) } +// } + +// /// Converts high half of the smaller lane vector to a larger lane +// /// vector, sign extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_s))] +// pub fn i16x8_widen_high_i8x16_s(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_high_i16x8_s(transmute(a))) } +// } + +// /// Converts low half of the smaller lane vector to a larger lane +// /// vector, zero extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.widen_low_i8x16_u))] +// pub fn i16x8_widen_low_i8x16_u(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_low_i16x8_u(transmute(a))) } +// } + +// /// Converts high half of the smaller lane vector to a larger lane +// /// vector, zero extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i16x8.widen_high_i8x16_u))] +// pub fn i16x8_widen_high_i8x16_u(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_high_i16x8_u(transmute(a))) } +// } + /// Shifts each lane to the left by the specified number of bits. /// /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i16x8.shl))] pub fn i16x8_shl(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shl(a.as_i16x8(), i16x8::splat(amt as i16))) } } @@ -1143,8 +1521,7 @@ pub fn i16x8_shl(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i16x8.shl))] +#[cfg_attr(test, assert_instr(i16x8.shr_s))] pub fn i16x8_shr_s(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_i16x8(), i16x8::splat(amt as i16))) } } @@ -1155,8 +1532,7 @@ pub fn i16x8_shr_s(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i16x8.shl))] +#[cfg_attr(test, assert_instr(i16x8.shr_u))] pub fn i16x8_shr_u(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_u16x8(), u16x8::splat(amt as u16))) } } @@ -1215,6 +1591,76 @@ pub fn i16x8_mul(a: v128, b: v128) -> v128 { unsafe { transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } } +/// Compares lane-wise signed integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.min_s))] +pub fn i16x8_min_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i16x8>(a); + let b = transmute::<_, i16x8>(b); + transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the minimum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.min_u))] +pub fn i16x8_min_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u16x8>(a); + let b = transmute::<_, u16x8>(b); + transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b)) + } +} + +/// Compares lane-wise signed integers, and returns the maximum of +/// each pair.
+#[inline] +#[cfg_attr(test, assert_instr(i16x8.max_s))] +pub fn i16x8_max_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i16x8>(a); + let b = transmute::<_, i16x8>(b); + transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.max_u))] +pub fn i16x8_max_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u16x8>(a); + let b = transmute::<_, u16x8>(b); + transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b)) + } +} + +/// Lane-wise rounding average. +#[inline] +#[cfg_attr(test, assert_instr(i16x8.avgr_u))] +pub fn i16x8_avgr_u(a: v128, b: v128) -> v128 { + unsafe { transmute(llvm_avgr_u_i16x8(transmute(a), transmute(b))) } +} + +/// Lane-wise wrapping absolute value. +#[inline] +// #[cfg_attr(test, assert_instr(i32x4.abs))] // FIXME support not in our LLVM yet +pub fn i32x4_abs(a: v128) -> v128 { + unsafe { + let a = transmute::<_, i32x4>(a); + let zero = i32x4::splat(0); + transmute(simd_select::<i32x4, _>( + simd_lt(a, zero), + simd_sub(zero, a), + a, + )) + } +} + /// Negates a 128-bit vectors intepreted as four 32-bit signed integers #[inline] #[cfg_attr(test, assert_instr(i32x4.neg))] @@ -1236,12 +1682,53 @@ pub fn i32x4_all_true(a: v128) -> i32 { unsafe { llvm_i32x4_all_true(a.as_i32x4()) } } +// FIXME: not available in our LLVM yet +// /// Extracts the high bit for each lane in `a` and produce a scalar mask with +// /// all bits concatenated. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.all_true))] +// pub fn i32x4_bitmask(a: v128) -> i32 { +// unsafe { llvm_bitmask_i32x4(transmute(a)) } +// } + +// FIXME waiting on a runtime implementation to test +// /// Converts low half of the smaller lane vector to a larger lane +// /// vector, sign extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_s))] +// pub fn i32x4_widen_low_i16x8_s(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_low_i32x4_s(transmute(a))) } +// } + +// /// Converts high half of the smaller lane vector to a larger lane +// /// vector, sign extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_s))] +// pub fn i32x4_widen_high_i16x8_s(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_high_i32x4_s(transmute(a))) } +// } + +// /// Converts low half of the smaller lane vector to a larger lane +// /// vector, zero extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.widen_low_i16x8_u))] +// pub fn i32x4_widen_low_i16x8_u(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_low_i32x4_u(transmute(a))) } +// } + +// /// Converts high half of the smaller lane vector to a larger lane +// /// vector, zero extended. +// #[inline] +// #[cfg_attr(test, assert_instr(i32x4.widen_high_i16x8_u))] +// pub fn i32x4_widen_high_i16x8_u(a: v128) -> v128 { +// unsafe { transmute(llvm_widen_high_i32x4_u(transmute(a))) } +// } + /// Shifts each lane to the left by the specified number of bits. /// /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i32x4.shl))] pub fn i32x4_shl(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shl(a.as_i32x4(), i32x4::splat(amt as i32))) } } @@ -1253,8 +1740,7 @@ pub fn i32x4_shl(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width.
#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i32x4.shl))] +#[cfg_attr(test, assert_instr(i32x4.shr_s))] pub fn i32x4_shr_s(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_i32x4(), i32x4::splat(amt as i32))) } } @@ -1265,8 +1751,7 @@ pub fn i32x4_shr_s(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i32x4.shl))] +#[cfg_attr(test, assert_instr(i32x4.shr_u))] pub fn i32x4_shr_u(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_u32x4(), u32x4::splat(amt as u32))) } } @@ -1293,28 +1778,59 @@ pub fn i32x4_mul(a: v128, b: v128) -> v128 { unsafe { transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) } } -/// Negates a 128-bit vectors intepreted as two 64-bit signed integers +/// Compares lane-wise signed integers, and returns the minimum of +/// each pair. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i32x4.neg))] -pub fn i64x2_neg(a: v128) -> v128 { - unsafe { transmute(simd_mul(a.as_i64x2(), i64x2::splat(-1))) } +#[cfg_attr(test, assert_instr(i32x4.min_s))] +pub fn i32x4_min_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i32x4>(a); + let b = transmute::<_, i32x4>(b); + transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b)) + } } -/// Returns 1 if any lane is nonzero or 0 if all lanes are zero. +/// Compares lane-wise unsigned integers, and returns the minimum of +/// each pair. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.any_true))] -pub fn i64x2_any_true(a: v128) -> i32 { - unsafe { llvm_i64x2_any_true(a.as_i64x2()) } +#[cfg_attr(test, assert_instr(i32x4.min_u))] +pub fn i32x4_min_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u32x4>(a); + let b = transmute::<_, u32x4>(b); + transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b)) + } } -/// Returns 1 if all lanes are nonzero or 0 if any lane is nonzero. +/// Compares lane-wise signed integers, and returns the maximum of +/// each pair. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.all_true))] -pub fn i64x2_all_true(a: v128) -> i32 { - unsafe { llvm_i64x2_all_true(a.as_i64x2()) } +#[cfg_attr(test, assert_instr(i32x4.max_s))] +pub fn i32x4_max_s(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, i32x4>(a); + let b = transmute::<_, i32x4>(b); + transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b)) + } +} + +/// Compares lane-wise unsigned integers, and returns the maximum of +/// each pair. +#[inline] +#[cfg_attr(test, assert_instr(i32x4.max_u))] +pub fn i32x4_max_u(a: v128, b: v128) -> v128 { + unsafe { + let a = transmute::<_, u32x4>(a); + let b = transmute::<_, u32x4>(b); + transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b)) + } +} + +/// Negates a 128-bit vector interpreted as two 64-bit signed integers +#[inline] +#[cfg_attr(test, assert_instr(i64x2.neg))] +pub fn i64x2_neg(a: v128) -> v128 { + unsafe { transmute(simd_mul(a.as_i64x2(), i64x2::splat(-1))) } } /// Shifts each lane to the left by the specified number of bits. @@ -1322,7 +1838,6 @@ pub fn i64x2_all_true(a: v128) -> i32 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width.
#[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i64x2.shl))] pub fn i64x2_shl(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shl(a.as_i64x2(), i64x2::splat(amt as i64))) } @@ -1334,8 +1849,7 @@ pub fn i64x2_shl(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.shl))] +#[cfg_attr(test, assert_instr(i64x2.shr_s))] pub fn i64x2_shr_s(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_i64x2(), i64x2::splat(amt as i64))) } } @@ -1346,15 +1860,13 @@ pub fn i64x2_shr_s(a: v128, amt: u32) -> v128 { /// Only the low bits of the shift amount are used if the shift amount is /// greater than the lane width. #[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(i64x2.shl))] +#[cfg_attr(test, assert_instr(i64x2.shr_u))] pub fn i64x2_shr_u(a: v128, amt: u32) -> v128 { unsafe { transmute(simd_shr(a.as_u64x2(), u64x2::splat(amt as u64))) } } /// Adds two 128-bit vectors as if they were two packed two 64-bit integers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i64x2.add))] pub fn i64x2_add(a: v128, b: v128) -> v128 { unsafe { transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } @@ -1362,12 +1874,18 @@ pub fn i64x2_add(a: v128, b: v128) -> v128 { /// Subtracts two 128-bit vectors as if they were two packed two 64-bit integers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(i64x2.sub))] pub fn i64x2_sub(a: v128, b: v128) -> v128 { unsafe { transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } } +/// Multiplies two 128-bit vectors as if they were two packed two 64-bit integers. +#[inline] +// #[cfg_attr(test, assert_instr(i64x2.mul))] // FIXME: not present in our LLVM +pub fn i64x2_mul(a: v128, b: v128) -> v128 { + unsafe { transmute(simd_mul(a.as_i64x2(), b.as_i64x2())) } +} + /// Calculates the absolute value of each lane of a 128-bit vector interpreted /// as four 32-bit floating point numbers. #[inline] @@ -1387,7 +1905,6 @@ pub fn f32x4_neg(a: v128) -> v128 { /// Calculates the square root of each lane of a 128-bit vector interpreted as /// four 32-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f32x4.sqrt))] pub fn f32x4_sqrt(a: v128) -> v128 { unsafe { transmute(llvm_f32x4_sqrt(a.as_f32x4())) } @@ -1420,7 +1937,6 @@ pub fn f32x4_mul(a: v128, b: v128) -> v128 { /// Divides pairwise lanes of two 128-bit vectors interpreted as four 32-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f32x4.div))] pub fn f32x4_div(a: v128, b: v128) -> v128 { unsafe { transmute(simd_div(a.as_f32x4(), b.as_f32x4())) } @@ -1445,7 +1961,6 @@ pub fn f32x4_max(a: v128, b: v128) -> v128 { /// Calculates the absolute value of each lane of a 128-bit vector interpreted /// as two 64-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.abs))] pub fn f64x2_abs(a: v128) -> v128 { unsafe { transmute(llvm_f64x2_abs(a.as_f64x2())) } @@ -1454,8 +1969,7 @@ pub fn f64x2_abs(a: v128) -> v128 { /// Negates each lane of a 128-bit vector interpreted as two 64-bit floating /// point numbers. 
#[inline] -#[cfg(not(only_node_compatible_functions))] -#[cfg_attr(test, assert_instr(f64x2.abs))] +#[cfg_attr(test, assert_instr(f64x2.neg))] pub fn f64x2_neg(a: v128) -> v128 { unsafe { f64x2_mul(a, transmute(f64x2(-1.0, -1.0))) } } @@ -1463,7 +1977,6 @@ pub fn f64x2_neg(a: v128) -> v128 { /// Calculates the square root of each lane of a 128-bit vector interpreted as /// two 64-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.sqrt))] pub fn f64x2_sqrt(a: v128) -> v128 { unsafe { transmute(llvm_f64x2_sqrt(a.as_f64x2())) } @@ -1472,7 +1985,6 @@ pub fn f64x2_sqrt(a: v128) -> v128 { /// Adds pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.add))] pub fn f64x2_add(a: v128, b: v128) -> v128 { unsafe { transmute(simd_add(a.as_f64x2(), b.as_f64x2())) } @@ -1481,7 +1993,6 @@ pub fn f64x2_add(a: v128, b: v128) -> v128 { /// Subtracts pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.sub))] pub fn f64x2_sub(a: v128, b: v128) -> v128 { unsafe { transmute(simd_sub(a.as_f64x2(), b.as_f64x2())) } @@ -1490,7 +2001,6 @@ pub fn f64x2_sub(a: v128, b: v128) -> v128 { /// Multiplies pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.mul))] pub fn f64x2_mul(a: v128, b: v128) -> v128 { unsafe { transmute(simd_mul(a.as_f64x2(), b.as_f64x2())) } @@ -1499,7 +2009,6 @@ pub fn f64x2_mul(a: v128, b: v128) -> v128 { /// Divides pairwise lanes of two 128-bit vectors interpreted as two 64-bit /// floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.div))] pub fn f64x2_div(a: v128, b: v128) -> v128 { unsafe { transmute(simd_div(a.as_f64x2(), b.as_f64x2())) } @@ -1508,7 +2017,6 @@ pub fn f64x2_div(a: v128, b: v128) -> v128 { /// Calculates the minimum of pairwise lanes of two 128-bit vectors interpreted /// as two 64-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.min))] pub fn f64x2_min(a: v128, b: v128) -> v128 { unsafe { transmute(llvm_f64x2_min(a.as_f64x2(), b.as_f64x2())) } @@ -1517,7 +2025,6 @@ pub fn f64x2_min(a: v128, b: v128) -> v128 { /// Calculates the maximum of pairwise lanes of two 128-bit vectors interpreted /// as two 64-bit floating point numbers. #[inline] -#[cfg(not(only_node_compatible_functions))] #[cfg_attr(test, assert_instr(f64x2.max))] pub fn f64x2_max(a: v128, b: v128) -> v128 { unsafe { transmute(llvm_f64x2_max(a.as_f64x2(), b.as_f64x2())) } @@ -1530,7 +2037,7 @@ pub fn f64x2_max(a: v128, b: v128) -> v128 { /// representable intger. #[inline] #[cfg_attr(test, assert_instr("i32x4.trunc_sat_f32x4_s"))] -pub fn i32x4_trunc_s_f32x4_sat(a: v128) -> v128 { +pub fn i32x4_trunc_sat_f32x4_s(a: v128) -> v128 { unsafe { transmute(simd_cast::<_, i32x4>(a.as_f32x4())) } } @@ -1541,34 +2048,10 @@ pub fn i32x4_trunc_s_f32x4_sat(a: v128) -> v128 { /// representable intger. 
 #[inline]
 #[cfg_attr(test, assert_instr("i32x4.trunc_sat_f32x4_u"))]
-pub fn i32x4_trunc_u_f32x4_sat(a: v128) -> v128 {
+pub fn i32x4_trunc_sat_f32x4_u(a: v128) -> v128 {
     unsafe { transmute(simd_cast::<_, u32x4>(a.as_f32x4())) }
 }
 
-/// Converts a 128-bit vector interpreted as two 64-bit floating point numbers
-/// into a 128-bit vector of two 64-bit signed integers.
-///
-/// NaN is converted to 0 and if it's out of bounds it becomes the nearest
-/// representable intger.
-#[inline]
-#[cfg(not(only_node_compatible_functions))]
-#[cfg_attr(test, assert_instr("i64x2.trunc_s/f64x2:sat"))]
-pub fn i64x2_trunc_s_f64x2_sat(a: v128) -> v128 {
-    unsafe { transmute(simd_cast::<_, i64x2>(a.as_f64x2())) }
-}
-
-/// Converts a 128-bit vector interpreted as two 64-bit floating point numbers
-/// into a 128-bit vector of two 64-bit unsigned integers.
-///
-/// NaN is converted to 0 and if it's out of bounds it becomes the nearest
-/// representable intger.
-#[inline]
-#[cfg(not(only_node_compatible_functions))]
-#[cfg_attr(test, assert_instr("i64x2.trunc_u/f64x2:sat"))]
-pub fn i64x2_trunc_u_f64x2_sat(a: v128) -> v128 {
-    unsafe { transmute(simd_cast::<_, u64x2>(a.as_f64x2())) }
-}
-
 /// Converts a 128-bit vector interpreted as four 32-bit signed integers into a
 /// 128-bit vector of four 32-bit floating point numbers.
 #[inline]
@@ -1585,24 +2068,6 @@ pub fn f32x4_convert_i32x4_u(a: v128) -> v128 {
     unsafe { transmute(simd_cast::<_, f32x4>(a.as_u32x4())) }
 }
 
-/// Converts a 128-bit vector interpreted as two 64-bit signed integers into a
-/// 128-bit vector of two 64-bit floating point numbers.
-#[inline]
-#[cfg(not(only_node_compatible_functions))]
-#[cfg_attr(test, assert_instr("f64x2.convert_s/i64x2"))]
-pub fn f64x2_convert_s_i64x2(a: v128) -> v128 {
-    unsafe { transmute(simd_cast::<_, f64x2>(a.as_i64x2())) }
-}
-
-/// Converts a 128-bit vector interpreted as two 64-bit unsigned integers into a
-/// 128-bit vector of two 64-bit floating point numbers.
-#[inline]
-#[cfg(not(only_node_compatible_functions))]
-#[cfg_attr(test, assert_instr("f64x2.convert_u/i64x2"))]
-pub fn f64x2_convert_u_i64x2(a: v128) -> v128 {
-    unsafe { transmute(simd_cast::<_, f64x2>(a.as_u64x2())) }
-}
-
 #[cfg(test)]
 pub mod tests {
     use super::*;
@@ -1610,7 +2075,6 @@ pub mod tests {
     use std::mem;
     use std::num::Wrapping;
     use std::prelude::v1::*;
-    use wasm_bindgen_test::*;
 
     fn compare_bytes(a: v128, b: v128) {
         let a: [u8; 16] = unsafe { transmute(a) };
@@ -1618,17 +2082,15 @@ pub mod tests {
         assert_eq!(a, b);
     }
 
-    #[wasm_bindgen_test]
-    #[cfg(not(only_node_compatible_functions))]
+    #[test]
    fn test_v128_const() {
-        const A: v128 =
-            unsafe { super::v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) };
+        const A: v128 = super::v128_const(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
         compare_bytes(A, A);
     }
 
     macro_rules! test_splat {
         ($test_id:ident: $val:expr => $($vals:expr),*) => {
-            #[wasm_bindgen_test]
+            #[test]
             fn $test_id() {
                 let a = super::$test_id($val);
                 let b: v128 = unsafe {
@@ -1642,10 +2104,8 @@ pub mod tests {
     test_splat!(i8x16_splat: 42 => 42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42);
     test_splat!(i16x8_splat: 42 => 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0, 42, 0);
     test_splat!(i32x4_splat: 42 => 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0, 42, 0, 0, 0);
-    #[cfg(not(only_node_compatible_functions))]
     test_splat!(i64x2_splat: 42 => 42, 0, 0, 0, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0);
     test_splat!(f32x4_splat: 42.
=> 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66, 0, 0, 40, 66); - #[cfg(not(only_node_compatible_functions))] test_splat!(f64x2_splat: 42. => 0, 0, 0, 0, 0, 0, 69, 64, 0, 0, 0, 0, 0, 0, 69, 64); // tests extract and replace lanes @@ -1658,14 +2118,14 @@ pub mod tests { count: $count:expr, indices: [$($idx:expr),*], ) => { - #[wasm_bindgen_test] + #[test] fn $test_id() { unsafe { let arr: [$elem; $count] = [123 as $elem; $count]; let vec: v128 = transmute(arr); $( assert_eq!($extract(vec, $idx), 123 as $elem); - )*; + )* // create a vector from array and check that the indices contain // the same values as in the array: @@ -1676,7 +2136,7 @@ pub mod tests { let tmp = $replace(vec, $idx, 124 as $elem); assert_eq!($extract(tmp, $idx), 124 as $elem); - )*; + )* } } } @@ -1706,7 +2166,6 @@ pub mod tests { count: 4, indices: [0, 1, 2, 3], } - #[cfg(not(only_node_compatible_functions))] test_extract! { name: test_i64x2_extract_replace, extract: i64x2_extract_lane, @@ -1723,7 +2182,6 @@ pub mod tests { count: 4, indices: [0, 1, 2, 3], } - #[cfg(not(only_node_compatible_functions))] test_extract! { name: test_f64x2_extract_replace, extract: f64x2_extract_lane, @@ -1739,7 +2197,7 @@ pub mod tests { $([$($vec1:tt)*] ($op:tt | $f:ident) [$($vec2:tt)*],)* })* ) => ($( - #[wasm_bindgen_test] + #[test] fn $name() { unsafe { $( @@ -1768,7 +2226,7 @@ pub mod tests { $(($op:tt | $f:ident) [$($vec1:tt)*],)* })* ) => ($( - #[wasm_bindgen_test] + #[test] fn $name() { unsafe { $( @@ -1816,19 +2274,6 @@ pub mod tests { (- | i8x16_sub) [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 4, 8], } - test_i8x16_mul => { - [0i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - (* | i8x16_mul) - [1i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], - - [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] - (* | i8x16_mul) - [-2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -18], - - [1i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] - (* | i8x16_mul) - [-127, -44, 43, 126, 4, 2, 9, -3, -59, -43, 39, -69, 79, -3, 30, 3], - } test_i16x8_add => { [0i16, 0, 0, 0, 0, 0, 0, 0] @@ -1910,425 +2355,401 @@ pub mod tests { // TODO: test_i64x2_neg } - // #[wasm_bindgen_test] - // fn v8x16_shuffle() { - // unsafe { - // let a = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - // let b = [ - // 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, - // 31, - // ]; - // - // let vec_a: v128 = transmute(a); - // let vec_b: v128 = transmute(b); - // - // let vec_r = v8x16_shuffle!( - // vec_a, - // vec_b, - // [0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30] - // ); - // - // let e = - // [0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]; - // let vec_e: v128 = transmute(e); - // compare_bytes(vec_r, vec_e); - // } - // } - // - // macro_rules! floating_point { - // (f32) => { - // true - // }; - // (f64) => { - // true - // }; - // ($id:ident) => { - // false - // }; - // } - // - // trait IsNan: Sized { - // fn is_nan(self) -> bool { - // false - // } - // } - // impl IsNan for i8 {} - // impl IsNan for i16 {} - // impl IsNan for i32 {} - // impl IsNan for i64 {} - // - // macro_rules! 
test_bop { - // ($id:ident[$ety:ident; $ecount:expr] | - // $binary_op:ident [$op_test_id:ident] : - // ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { - // test_bop!( - // $id[$ety; $ecount] => $ety | $binary_op [ $op_test_id ]: - // ([$($in_a),*], [$($in_b),*]) => [$($out),*] - // ); - // - // }; - // ($id:ident[$ety:ident; $ecount:expr] => $oty:ident | - // $binary_op:ident [$op_test_id:ident] : - // ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { - // #[wasm_bindgen_test] - // fn $op_test_id() { - // unsafe { - // let a_input: [$ety; $ecount] = [$($in_a),*]; - // let b_input: [$ety; $ecount] = [$($in_b),*]; - // let output: [$oty; $ecount] = [$($out),*]; - // - // let a_vec_in: v128 = transmute(a_input); - // let b_vec_in: v128 = transmute(b_input); - // let vec_res: v128 = $id::$binary_op(a_vec_in, b_vec_in); - // - // let res: [$oty; $ecount] = transmute(vec_res); - // - // if !floating_point!($ety) { - // assert_eq!(res, output); - // } else { - // for i in 0..$ecount { - // let r = res[i]; - // let o = output[i]; - // assert_eq!(r.is_nan(), o.is_nan()); - // if !r.is_nan() { - // assert_eq!(r, o); - // } - // } - // } - // } - // } - // } - // } - // - // macro_rules! test_bops { - // ($id:ident[$ety:ident; $ecount:expr] | - // $binary_op:ident [$op_test_id:ident]: - // ([$($in_a:expr),*], $in_b:expr) => [$($out:expr),*]) => { - // #[wasm_bindgen_test] - // fn $op_test_id() { - // unsafe { - // let a_input: [$ety; $ecount] = [$($in_a),*]; - // let output: [$ety; $ecount] = [$($out),*]; - // - // let a_vec_in: v128 = transmute(a_input); - // let vec_res: v128 = $id::$binary_op(a_vec_in, $in_b); - // - // let res: [$ety; $ecount] = transmute(vec_res); - // assert_eq!(res, output); - // } - // } - // } - // } - // - // macro_rules! 
test_uop { - // ($id:ident[$ety:ident; $ecount:expr] | - // $unary_op:ident [$op_test_id:ident]: [$($in_a:expr),*] => [$($out:expr),*]) => { - // #[wasm_bindgen_test] - // fn $op_test_id() { - // unsafe { - // let a_input: [$ety; $ecount] = [$($in_a),*]; - // let output: [$ety; $ecount] = [$($out),*]; - // - // let a_vec_in: v128 = transmute(a_input); - // let vec_res: v128 = $id::$unary_op(a_vec_in); - // - // let res: [$ety; $ecount] = transmute(vec_res); - // assert_eq!(res, output); - // } - // } - // } - // } - // - // - // - // test_bops!(i8x16[i8; 16] | shl[i8x16_shl_test]: - // ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => - // [0, -2, 4, 6, 8, 10, 12, -2, 2, 2, 2, 2, 2, 2, 2, 2]); - // test_bops!(i16x8[i16; 8] | shl[i16x8_shl_test]: - // ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => - // [0, -2, 4, 6, 8, 10, 12, -2]); - // test_bops!(i32x4[i32; 4] | shl[i32x4_shl_test]: - // ([0, -1, 2, 3], 1) => [0, -2, 4, 6]); - // test_bops!(i64x2[i64; 2] | shl[i64x2_shl_test]: - // ([0, -1], 1) => [0, -2]); - // - // test_bops!(i8x16[i8; 16] | shr_s[i8x16_shr_s_test]: - // ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => - // [0, -1, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]); - // test_bops!(i16x8[i16; 8] | shr_s[i16x8_shr_s_test]: - // ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => - // [0, -1, 1, 1, 2, 2, 3, i16::MAX / 2]); - // test_bops!(i32x4[i32; 4] | shr_s[i32x4_shr_s_test]: - // ([0, -1, 2, 3], 1) => [0, -1, 1, 1]); - // test_bops!(i64x2[i64; 2] | shr_s[i64x2_shr_s_test]: - // ([0, -1], 1) => [0, -1]); - // - // test_bops!(i8x16[i8; 16] | shr_u[i8x16_uhr_u_test]: - // ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) => - // [0, i8::MAX, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]); - // test_bops!(i16x8[i16; 8] | shr_u[i16x8_uhr_u_test]: - // ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) => - // [0, i16::MAX, 1, 1, 2, 2, 3, i16::MAX / 2]); - // test_bops!(i32x4[i32; 4] | shr_u[i32x4_uhr_u_test]: - // ([0, -1, 2, 3], 1) => [0, i32::MAX, 1, 1]); - // test_bops!(i64x2[i64; 2] | shr_u[i64x2_uhr_u_test]: - // ([0, -1], 1) => [0, i64::MAX]); - // - // #[wasm_bindgen_test] - // fn v128_bitwise_logical_ops() { - // unsafe { - // let a: [u32; 4] = [u32::MAX, 0, u32::MAX, 0]; - // let b: [u32; 4] = [u32::MAX; 4]; - // let c: [u32; 4] = [0; 4]; - // - // let vec_a: v128 = transmute(a); - // let vec_b: v128 = transmute(b); - // let vec_c: v128 = transmute(c); - // - // let r: v128 = v128::and(vec_a, vec_a); - // compare_bytes(r, vec_a); - // let r: v128 = v128::and(vec_a, vec_b); - // compare_bytes(r, vec_a); - // let r: v128 = v128::or(vec_a, vec_b); - // compare_bytes(r, vec_b); - // let r: v128 = v128::not(vec_b); - // compare_bytes(r, vec_c); - // let r: v128 = v128::xor(vec_a, vec_c); - // compare_bytes(r, vec_a); - // - // let r: v128 = v128::bitselect(vec_b, vec_c, vec_b); - // compare_bytes(r, vec_b); - // let r: v128 = v128::bitselect(vec_b, vec_c, vec_c); - // compare_bytes(r, vec_c); - // let r: v128 = v128::bitselect(vec_b, vec_c, vec_a); - // compare_bytes(r, vec_a); - // } - // } - // - // macro_rules! 
test_bool_red { - // ($id:ident[$test_id:ident] | [$($true:expr),*] | [$($false:expr),*] | [$($alt:expr),*]) => { - // #[wasm_bindgen_test] - // fn $test_id() { - // unsafe { - // let vec_a: v128 = transmute([$($true),*]); // true - // let vec_b: v128 = transmute([$($false),*]); // false - // let vec_c: v128 = transmute([$($alt),*]); // alternating - // - // assert_eq!($id::any_true(vec_a), 1); - // assert_eq!($id::any_true(vec_b), 0); - // assert_eq!($id::any_true(vec_c), 1); - // - // assert_eq!($id::all_true(vec_a), 1); - // assert_eq!($id::all_true(vec_b), 0); - // assert_eq!($id::all_true(vec_c), 0); - // } - // } - // } - // } - // - // test_bool_red!( - // i8x16[i8x16_boolean_reductions] - // | [1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - // | [0_i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - // | [1_i8, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] - // ); - // test_bool_red!( - // i16x8[i16x8_boolean_reductions] - // | [1_i16, 1, 1, 1, 1, 1, 1, 1] - // | [0_i16, 0, 0, 0, 0, 0, 0, 0] - // | [1_i16, 0, 1, 0, 1, 0, 1, 0] - // ); - // test_bool_red!( - // i32x4[i32x4_boolean_reductions] - // | [1_i32, 1, 1, 1] - // | [0_i32, 0, 0, 0] - // | [1_i32, 0, 1, 0] - // ); - // test_bool_red!( - // i64x2[i64x2_boolean_reductions] | [1_i64, 1] | [0_i64, 0] | [1_i64, 0] - // ); - // - // test_bop!(i8x16[i8; 16] | eq[i8x16_eq_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - // [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i16x8[i16; 8] | eq[i16x8_eq_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i32x4[i32; 4] | eq[i32x4_eq_test]: - // ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); - // test_bop!(i64x2[i64; 2] | eq[i64x2_eq_test]: ([0, 1], [0, 2]) => [-1, 0]); - // test_bop!(f32x4[f32; 4] => i32 | eq[f32x4_eq_test]: - // ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); - // test_bop!(f64x2[f64; 2] => i64 | eq[f64x2_eq_test]: ([0., 1.], [0., 2.]) => [-1, 0]); - // - // test_bop!(i8x16[i8; 16] | ne[i8x16_ne_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - // [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => - // [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i16x8[i16; 8] | ne[i16x8_ne_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => - // [0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i32x4[i32; 4] | ne[i32x4_ne_test]: - // ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); - // test_bop!(i64x2[i64; 2] | ne[i64x2_ne_test]: ([0, 1], [0, 2]) => [0, -1]); - // test_bop!(f32x4[f32; 4] => i32 | ne[f32x4_ne_test]: - // ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]); - // test_bop!(f64x2[f64; 2] => i64 | ne[f64x2_ne_test]: ([0., 1.], [0., 2.]) => [0, -1]); - // - // test_bop!(i8x16[i8; 16] | lt[i8x16_lt_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - // [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => - // [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i16x8[i16; 8] | lt[i16x8_lt_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => - // [0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i32x4[i32; 4] | lt[i32x4_lt_test]: - // ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); - // test_bop!(i64x2[i64; 2] | lt[i64x2_lt_test]: ([0, 1], [0, 2]) => [0, -1]); - // test_bop!(f32x4[f32; 4] => i32 | lt[f32x4_lt_test]: - // ([0., 1., 2., 3.], [0., 2., 2., 4.]) 
=> [0, -1, 0, -1]); - // test_bop!(f64x2[f64; 2] => i64 | lt[f64x2_lt_test]: ([0., 1.], [0., 2.]) => [0, -1]); - // - // test_bop!(i8x16[i8; 16] | gt[i8x16_gt_test]: - // ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15], - // [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) => - // [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i16x8[i16; 8] | gt[i16x8_gt_test]: - // ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) => - // [0, -1, 0, -1 ,0, -1, 0, 0]); - // test_bop!(i32x4[i32; 4] | gt[i32x4_gt_test]: - // ([0, 2, 2, 4], [0, 1, 2, 3]) => [0, -1, 0, -1]); - // test_bop!(i64x2[i64; 2] | gt[i64x2_gt_test]: ([0, 2], [0, 1]) => [0, -1]); - // test_bop!(f32x4[f32; 4] => i32 | gt[f32x4_gt_test]: - // ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [0, -1, 0, -1]); - // test_bop!(f64x2[f64; 2] => i64 | gt[f64x2_gt_test]: ([0., 2.], [0., 1.]) => [0, -1]); - // - // test_bop!(i8x16[i8; 16] | ge[i8x16_ge_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - // [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i16x8[i16; 8] | ge[i16x8_ge_test]: - // ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i32x4[i32; 4] | ge[i32x4_ge_test]: - // ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); - // test_bop!(i64x2[i64; 2] | ge[i64x2_ge_test]: ([0, 1], [0, 2]) => [-1, 0]); - // test_bop!(f32x4[f32; 4] => i32 | ge[f32x4_ge_test]: - // ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); - // test_bop!(f64x2[f64; 2] => i64 | ge[f64x2_ge_test]: ([0., 1.], [0., 2.]) => [-1, 0]); - // - // test_bop!(i8x16[i8; 16] | le[i8x16_le_test]: - // ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15], - // [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] - // ) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i16x8[i16; 8] | le[i16x8_le_test]: - // ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) => - // [-1, 0, -1, 0 ,-1, 0, -1, -1]); - // test_bop!(i32x4[i32; 4] | le[i32x4_le_test]: - // ([0, 2, 2, 4], [0, 1, 2, 3]) => [-1, 0, -1, 0]); - // test_bop!(i64x2[i64; 2] | le[i64x2_le_test]: ([0, 2], [0, 1]) => [-1, 0]); - // test_bop!(f32x4[f32; 4] => i32 | le[f32x4_le_test]: - // ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [-1, 0, -1, -0]); - // test_bop!(f64x2[f64; 2] => i64 | le[f64x2_le_test]: ([0., 2.], [0., 1.]) => [-1, 0]); - // - // #[wasm_bindgen_test] - // fn v128_bitwise_load_store() { - // unsafe { - // let mut arr: [i32; 4] = [0, 1, 2, 3]; - // - // let vec = v128::load(arr.as_ptr() as *const v128); - // let vec = i32x4::add(vec, vec); - // v128::store(arr.as_mut_ptr() as *mut v128, vec); - // - // assert_eq!(arr, [0, 2, 4, 6]); - // } - // } - // - // test_uop!(f32x4[f32; 4] | neg[f32x4_neg_test]: [0., 1., 2., 3.] => [ 0., -1., -2., -3.]); - // test_uop!(f32x4[f32; 4] | abs[f32x4_abs_test]: [0., -1., 2., -3.] 
=> [ 0., 1., 2., 3.]); - // test_bop!(f32x4[f32; 4] | min[f32x4_min_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., -3., -4., 8.]); - // test_bop!(f32x4[f32; 4] | min[f32x4_min_test_nan]: - // ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN]) - // => [0., -3., -4., std::f32::NAN]); - // test_bop!(f32x4[f32; 4] | max[f32x4_max_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -1., 7., 10.]); - // test_bop!(f32x4[f32; 4] | max[f32x4_max_test_nan]: - // ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN]) - // => [1., -1., 7., std::f32::NAN]); - // test_bop!(f32x4[f32; 4] | add[f32x4_add_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -4., 3., 18.]); - // test_bop!(f32x4[f32; 4] | sub[f32x4_sub_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [-1., 2., 11., -2.]); - // test_bop!(f32x4[f32; 4] | mul[f32x4_mul_test]: - // ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., 3., -28., 80.]); - // test_bop!(f32x4[f32; 4] | div[f32x4_div_test]: - // ([0., -8., 70., 8.], [1., 4., 10., 2.]) => [0., -2., 7., 4.]); - // - // test_uop!(f64x2[f64; 2] | neg[f64x2_neg_test]: [0., 1.] => [ 0., -1.]); - // test_uop!(f64x2[f64; 2] | abs[f64x2_abs_test]: [0., -1.] => [ 0., 1.]); - // test_bop!(f64x2[f64; 2] | min[f64x2_min_test]: - // ([0., -1.], [1., -3.]) => [0., -3.]); - // test_bop!(f64x2[f64; 2] | min[f64x2_min_test_nan]: - // ([7., 8.], [-4., std::f64::NAN]) - // => [ -4., std::f64::NAN]); - // test_bop!(f64x2[f64; 2] | max[f64x2_max_test]: - // ([0., -1.], [1., -3.]) => [1., -1.]); - // test_bop!(f64x2[f64; 2] | max[f64x2_max_test_nan]: - // ([7., 8.], [ -4., std::f64::NAN]) - // => [7., std::f64::NAN]); - // test_bop!(f64x2[f64; 2] | add[f64x2_add_test]: - // ([0., -1.], [1., -3.]) => [1., -4.]); - // test_bop!(f64x2[f64; 2] | sub[f64x2_sub_test]: - // ([0., -1.], [1., -3.]) => [-1., 2.]); - // test_bop!(f64x2[f64; 2] | mul[f64x2_mul_test]: - // ([0., -1.], [1., -3.]) => [0., 3.]); - // test_bop!(f64x2[f64; 2] | div[f64x2_div_test]: - // ([0., -8.], [1., 4.]) => [0., -2.]); - // - // macro_rules! test_conv { - // ($test_id:ident | $conv_id:ident | $to_ty:ident | $from:expr, $to:expr) => { - // #[wasm_bindgen_test] - // fn $test_id() { - // unsafe { - // let from: v128 = transmute($from); - // let to: v128 = transmute($to); - // - // let r: v128 = $to_ty::$conv_id(from); - // - // compare_bytes(r, to); - // } - // } - // }; - // } - // - // test_conv!( - // f32x4_convert_s_i32x4 | convert_s_i32x4 | f32x4 | [1_i32, 2, 3, 4], - // [1_f32, 2., 3., 4.] - // ); - // test_conv!( - // f32x4_convert_u_i32x4 - // | convert_u_i32x4 - // | f32x4 - // | [u32::MAX, 2, 3, 4], - // [u32::MAX as f32, 2., 3., 4.] - // ); - // test_conv!( - // f64x2_convert_s_i64x2 | convert_s_i64x2 | f64x2 | [1_i64, 2], - // [1_f64, 2.] - // ); + #[test] + fn v8x16_shuffle() { + unsafe { + let a = [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + let b = [ + 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ]; + + let vec_a: v128 = transmute(a); + let vec_b: v128 = transmute(b); + + let vec_r = v8x16_shuffle!( + vec_a, vec_b, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30, + ); + + let e = [0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30]; + let vec_e: v128 = transmute(e); + compare_bytes(vec_r, vec_e); + } + } + + macro_rules! 
floating_point { + (f32) => { + true + }; + (f64) => { + true + }; + ($id:ident) => { + false + }; + } + + trait IsNan: Sized { + fn is_nan(self) -> bool { + false + } + } + impl IsNan for i8 {} + impl IsNan for i16 {} + impl IsNan for i32 {} + impl IsNan for i64 {} + + macro_rules! test_bop { + ($id:ident[$ety:ident; $ecount:expr] | + $binary_op:ident [$op_test_id:ident] : + ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { + test_bop!( + $id[$ety; $ecount] => $ety | $binary_op [ $op_test_id ]: + ([$($in_a),*], [$($in_b),*]) => [$($out),*] + ); + + }; + ($id:ident[$ety:ident; $ecount:expr] => $oty:ident | + $binary_op:ident [$op_test_id:ident] : + ([$($in_a:expr),*], [$($in_b:expr),*]) => [$($out:expr),*]) => { + #[test] + fn $op_test_id() { + unsafe { + let a_input: [$ety; $ecount] = [$($in_a),*]; + let b_input: [$ety; $ecount] = [$($in_b),*]; + let output: [$oty; $ecount] = [$($out),*]; + + let a_vec_in: v128 = transmute(a_input); + let b_vec_in: v128 = transmute(b_input); + let vec_res: v128 = $binary_op(a_vec_in, b_vec_in); + + let res: [$oty; $ecount] = transmute(vec_res); + + if !floating_point!($ety) { + assert_eq!(res, output); + } else { + for i in 0..$ecount { + let r = res[i]; + let o = output[i]; + assert_eq!(r.is_nan(), o.is_nan()); + if !r.is_nan() { + assert_eq!(r, o); + } + } + } + } + } + } + } + + macro_rules! test_bops { + ($id:ident[$ety:ident; $ecount:expr] | + $binary_op:ident [$op_test_id:ident]: + ([$($in_a:expr),*], $in_b:expr) => [$($out:expr),*]) => { + #[test] + fn $op_test_id() { + unsafe { + let a_input: [$ety; $ecount] = [$($in_a),*]; + let output: [$ety; $ecount] = [$($out),*]; + + let a_vec_in: v128 = transmute(a_input); + let vec_res: v128 = $binary_op(a_vec_in, $in_b); + + let res: [$ety; $ecount] = transmute(vec_res); + assert_eq!(res, output); + } + } + } + } + + macro_rules! 
test_uop {
+        ($id:ident[$ety:ident; $ecount:expr] |
+         $unary_op:ident [$op_test_id:ident]: [$($in_a:expr),*] => [$($out:expr),*]) => {
+            #[test]
+            fn $op_test_id() {
+                unsafe {
+                    let a_input: [$ety; $ecount] = [$($in_a),*];
+                    let output: [$ety; $ecount] = [$($out),*];
+
+                    let a_vec_in: v128 = transmute(a_input);
+                    let vec_res: v128 = $unary_op(a_vec_in);
+
+                    let res: [$ety; $ecount] = transmute(vec_res);
+                    assert_eq!(res, output);
+                }
+            }
+        }
+    }
+
+    test_bops!(i8x16[i8; 16] | i8x16_shl[i8x16_shl_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+               [0, -2, 4, 6, 8, 10, 12, -2, 2, 2, 2, 2, 2, 2, 2, 2]);
+    test_bops!(i16x8[i16; 8] | i16x8_shl[i16x8_shl_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) =>
+               [0, -2, 4, 6, 8, 10, 12, -2]);
+    test_bops!(i32x4[i32; 4] | i32x4_shl[i32x4_shl_test]:
+               ([0, -1, 2, 3], 1) => [0, -2, 4, 6]);
+    test_bops!(i64x2[i64; 2] | i64x2_shl[i64x2_shl_test]:
+               ([0, -1], 1) => [0, -2]);
+
+    test_bops!(i8x16[i8; 16] | i8x16_shr_s[i8x16_shr_s_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+               [0, -1, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]);
+    test_bops!(i16x8[i16; 8] | i16x8_shr_s[i16x8_shr_s_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) =>
+               [0, -1, 1, 1, 2, 2, 3, i16::MAX / 2]);
+    test_bops!(i32x4[i32; 4] | i32x4_shr_s[i32x4_shr_s_test]:
+               ([0, -1, 2, 3], 1) => [0, -1, 1, 1]);
+    test_bops!(i64x2[i64; 2] | i64x2_shr_s[i64x2_shr_s_test]:
+               ([0, -1], 1) => [0, -1]);
+
+    test_bops!(i8x16[i8; 16] | i8x16_shr_u[i8x16_shr_u_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i8::MAX, 1, 1, 1, 1, 1, 1, 1, 1], 1) =>
+               [0, i8::MAX, 1, 1, 2, 2, 3, 63, 0, 0, 0, 0, 0, 0, 0, 0]);
+    test_bops!(i16x8[i16; 8] | i16x8_shr_u[i16x8_shr_u_test]:
+               ([0, -1, 2, 3, 4, 5, 6, i16::MAX], 1) =>
+               [0, i16::MAX, 1, 1, 2, 2, 3, i16::MAX / 2]);
+    test_bops!(i32x4[i32; 4] | i32x4_shr_u[i32x4_shr_u_test]:
+               ([0, -1, 2, 3], 1) => [0, i32::MAX, 1, 1]);
+    test_bops!(i64x2[i64; 2] | i64x2_shr_u[i64x2_shr_u_test]:
+               ([0, -1], 1) => [0, i64::MAX]);
+
+    #[test]
+    fn v128_bitwise_logical_ops() {
+        unsafe {
+            let a: [u32; 4] = [u32::MAX, 0, u32::MAX, 0];
+            let b: [u32; 4] = [u32::MAX; 4];
+            let c: [u32; 4] = [0; 4];
+
+            let vec_a: v128 = transmute(a);
+            let vec_b: v128 = transmute(b);
+            let vec_c: v128 = transmute(c);
+
+            let r: v128 = v128_and(vec_a, vec_a);
+            compare_bytes(r, vec_a);
+            let r: v128 = v128_and(vec_a, vec_b);
+            compare_bytes(r, vec_a);
+            let r: v128 = v128_or(vec_a, vec_b);
+            compare_bytes(r, vec_b);
+            let r: v128 = v128_not(vec_b);
+            compare_bytes(r, vec_c);
+            let r: v128 = v128_xor(vec_a, vec_c);
+            compare_bytes(r, vec_a);
+
+            let r: v128 = v128_bitselect(vec_b, vec_c, vec_b);
+            compare_bytes(r, vec_b);
+            let r: v128 = v128_bitselect(vec_b, vec_c, vec_c);
+            compare_bytes(r, vec_c);
+            let r: v128 = v128_bitselect(vec_b, vec_c, vec_a);
+            compare_bytes(r, vec_a);
+        }
+    }
+
+    macro_rules!
test_bool_red { + ([$test_id:ident, $any:ident, $all:ident] | [$($true:expr),*] | [$($false:expr),*] | [$($alt:expr),*]) => { + #[test] + fn $test_id() { + unsafe { + let vec_a: v128 = transmute([$($true),*]); // true + let vec_b: v128 = transmute([$($false),*]); // false + let vec_c: v128 = transmute([$($alt),*]); // alternating + + assert_eq!($any(vec_a), 1); + assert_eq!($any(vec_b), 0); + assert_eq!($any(vec_c), 1); + + assert_eq!($all(vec_a), 1); + assert_eq!($all(vec_b), 0); + assert_eq!($all(vec_c), 0); + } + } + } + } + + test_bool_red!( + [i8x16_boolean_reductions, i8x16_any_true, i8x16_all_true] + | [1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + | [0_i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + | [1_i8, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0] + ); + test_bool_red!( + [i16x8_boolean_reductions, i16x8_any_true, i16x8_all_true] + | [1_i16, 1, 1, 1, 1, 1, 1, 1] + | [0_i16, 0, 0, 0, 0, 0, 0, 0] + | [1_i16, 0, 1, 0, 1, 0, 1, 0] + ); + test_bool_red!( + [i32x4_boolean_reductions, i32x4_any_true, i32x4_all_true] + | [1_i32, 1, 1, 1] + | [0_i32, 0, 0, 0] + | [1_i32, 0, 1, 0] + ); + + test_bop!(i8x16[i8; 16] | i8x16_eq[i8x16_eq_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i16x8[i16; 8] | i16x8_eq[i16x8_eq_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i32x4[i32; 4] | i32x4_eq[i32x4_eq_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_eq[f32x4_eq_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_eq[f64x2_eq_test]: ([0., 1.], [0., 2.]) => [-1, 0]); + + test_bop!(i8x16[i8; 16] | i8x16_ne[i8x16_ne_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i16x8[i16; 8] | i16x8_ne[i16x8_ne_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i32x4[i32; 4] | i32x4_ne[i32x4_ne_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_ne[f32x4_ne_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_ne[f64x2_ne_test]: ([0., 1.], [0., 2.]) => [0, -1]); + + test_bop!(i8x16[i8; 16] | i8x16_lt_s[i8x16_lt_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i16x8[i16; 8] | i16x8_lt_s[i16x8_lt_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i32x4[i32; 4] | i32x4_lt_s[i32x4_lt_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [0, -1, 0, -1]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_lt[f32x4_lt_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [0, -1, 0, -1]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_lt[f64x2_lt_test]: ([0., 1.], [0., 2.]) => [0, -1]); + + test_bop!(i8x16[i8; 16] | i8x16_gt_s[i8x16_gt_test]: + ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) => + [0, -1, 0, -1 ,0, -1, 0, 0, 0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i16x8[i16; 8] | i16x8_gt_s[i16x8_gt_test]: + ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 
6, 7]) => + [0, -1, 0, -1 ,0, -1, 0, 0]); + test_bop!(i32x4[i32; 4] | i32x4_gt_s[i32x4_gt_test]: + ([0, 2, 2, 4], [0, 1, 2, 3]) => [0, -1, 0, -1]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_gt[f32x4_gt_test]: + ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [0, -1, 0, -1]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_gt[f64x2_gt_test]: ([0., 2.], [0., 1.]) => [0, -1]); + + test_bop!(i8x16[i8; 16] | i8x16_ge_s[i8x16_ge_test]: + ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i16x8[i16; 8] | i16x8_ge_s[i16x8_ge_test]: + ([0, 1, 2, 3, 4, 5, 6, 7], [0, 2, 2, 4, 4, 6, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i32x4[i32; 4] | i32x4_ge_s[i32x4_ge_test]: + ([0, 1, 2, 3], [0, 2, 2, 4]) => [-1, 0, -1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_ge[f32x4_ge_test]: + ([0., 1., 2., 3.], [0., 2., 2., 4.]) => [-1, 0, -1, 0]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_ge[f64x2_ge_test]: ([0., 1.], [0., 2.]) => [-1, 0]); + + test_bop!(i8x16[i8; 16] | i8x16_le_s[i8x16_le_test]: + ([0, 2, 2, 4, 4, 6, 6, 7, 8, 10, 10, 12, 12, 14, 14, 15], + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] + ) => + [-1, 0, -1, 0 ,-1, 0, -1, -1, -1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i16x8[i16; 8] | i16x8_le_s[i16x8_le_test]: + ([0, 2, 2, 4, 4, 6, 6, 7], [0, 1, 2, 3, 4, 5, 6, 7]) => + [-1, 0, -1, 0 ,-1, 0, -1, -1]); + test_bop!(i32x4[i32; 4] | i32x4_le_s[i32x4_le_test]: + ([0, 2, 2, 4], [0, 1, 2, 3]) => [-1, 0, -1, 0]); + test_bop!(f32x4[f32; 4] => i32 | f32x4_le[f32x4_le_test]: + ([0., 2., 2., 4.], [0., 1., 2., 3.]) => [-1, 0, -1, -0]); + test_bop!(f64x2[f64; 2] => i64 | f64x2_le[f64x2_le_test]: ([0., 2.], [0., 1.]) => [-1, 0]); + + #[test] + fn v128_bitwise_load_store() { + unsafe { + let mut arr: [i32; 4] = [0, 1, 2, 3]; + + let vec = v128_load(arr.as_ptr() as *const v128); + let vec = i32x4_add(vec, vec); + v128_store(arr.as_mut_ptr() as *mut v128, vec); + + assert_eq!(arr, [0, 2, 4, 6]); + } + } + + test_uop!(f32x4[f32; 4] | f32x4_neg[f32x4_neg_test]: [0., 1., 2., 3.] => [ 0., -1., -2., -3.]); + test_uop!(f32x4[f32; 4] | f32x4_abs[f32x4_abs_test]: [0., -1., 2., -3.] => [ 0., 1., 2., 3.]); + test_bop!(f32x4[f32; 4] | f32x4_min[f32x4_min_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., -3., -4., 8.]); + test_bop!(f32x4[f32; 4] | f32x4_min[f32x4_min_test_nan]: + ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN]) + => [0., -3., -4., std::f32::NAN]); + test_bop!(f32x4[f32; 4] | f32x4_max[f32x4_max_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -1., 7., 10.]); + test_bop!(f32x4[f32; 4] | f32x4_max[f32x4_max_test_nan]: + ([0., -1., 7., 8.], [1., -3., -4., std::f32::NAN]) + => [1., -1., 7., std::f32::NAN]); + test_bop!(f32x4[f32; 4] | f32x4_add[f32x4_add_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [1., -4., 3., 18.]); + test_bop!(f32x4[f32; 4] | f32x4_sub[f32x4_sub_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [-1., 2., 11., -2.]); + test_bop!(f32x4[f32; 4] | f32x4_mul[f32x4_mul_test]: + ([0., -1., 7., 8.], [1., -3., -4., 10.]) => [0., 3., -28., 80.]); + test_bop!(f32x4[f32; 4] | f32x4_div[f32x4_div_test]: + ([0., -8., 70., 8.], [1., 4., 10., 2.]) => [0., -2., 7., 4.]); + + test_uop!(f64x2[f64; 2] | f64x2_neg[f64x2_neg_test]: [0., 1.] => [ 0., -1.]); + test_uop!(f64x2[f64; 2] | f64x2_abs[f64x2_abs_test]: [0., -1.] 
=> [ 0., 1.]); + test_bop!(f64x2[f64; 2] | f64x2_min[f64x2_min_test]: + ([0., -1.], [1., -3.]) => [0., -3.]); + test_bop!(f64x2[f64; 2] | f64x2_min[f64x2_min_test_nan]: + ([7., 8.], [-4., std::f64::NAN]) + => [ -4., std::f64::NAN]); + test_bop!(f64x2[f64; 2] | f64x2_max[f64x2_max_test]: + ([0., -1.], [1., -3.]) => [1., -1.]); + test_bop!(f64x2[f64; 2] | f64x2_max[f64x2_max_test_nan]: + ([7., 8.], [ -4., std::f64::NAN]) + => [7., std::f64::NAN]); + test_bop!(f64x2[f64; 2] | f64x2_add[f64x2_add_test]: + ([0., -1.], [1., -3.]) => [1., -4.]); + test_bop!(f64x2[f64; 2] | f64x2_sub[f64x2_sub_test]: + ([0., -1.], [1., -3.]) => [-1., 2.]); + test_bop!(f64x2[f64; 2] | f64x2_mul[f64x2_mul_test]: + ([0., -1.], [1., -3.]) => [0., 3.]); + test_bop!(f64x2[f64; 2] | f64x2_div[f64x2_div_test]: + ([0., -8.], [1., 4.]) => [0., -2.]); + + macro_rules! test_conv { + ($test_id:ident | $conv_id:ident | $to_ty:ident | $from:expr, $to:expr) => { + #[test] + fn $test_id() { + unsafe { + let from: v128 = transmute($from); + let to: v128 = transmute($to); + + let r: v128 = $conv_id(from); + + compare_bytes(r, to); + } + } + }; + } + + test_conv!( + f32x4_convert_s_i32x4 | f32x4_convert_i32x4_s | f32x4 | [1_i32, 2, 3, 4], + [1_f32, 2., 3., 4.] + ); + test_conv!( + f32x4_convert_u_i32x4 | f32x4_convert_i32x4_u | f32x4 | [u32::MAX, 2, 3, 4], + [u32::MAX as f32, 2., 3., 4.] + ); + + // FIXME: this fails, and produces 0 instead of saturating at i32::MAX // test_conv!( - // f64x2_convert_u_i64x2 - // | convert_u_i64x2 - // | f64x2 - // | [u64::MAX, 2], - // [18446744073709552000.0, 2.] + // i32x4_trunc_s_f32x4_sat + // | i32x4_trunc_sat_f32x4_s + // | i32x4 + // | [f32::NAN, 2., (i32::MAX as f32 + 1.), 4.], + // [0, 2, i32::MAX, 4] // ); - // - // // FIXME: this fails, and produces -2147483648 instead of saturating at - // // i32::MAX test_conv!(i32x4_trunc_s_f32x4_sat | trunc_s_f32x4_sat - // // | i32x4 | [1_f32, 2., (i32::MAX as f32 + 1.), 4.], - // // [1_i32, 2, i32::MAX, 4]); FIXME: add other saturating tests + // FIXME: add other saturating tests } diff --git a/crates/core_arch/tests/xcrate-macros.rs b/crates/core_arch/tests/xcrate-macros.rs new file mode 100644 index 0000000000..1b32a6c70d --- /dev/null +++ b/crates/core_arch/tests/xcrate-macros.rs @@ -0,0 +1,18 @@ +#![feature(stdsimd)] + +#[test] +#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] +fn wut() { + use core_arch::arch::wasm32; + let a = wasm32::v128_const(0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + let b = wasm32::v128_const( + 16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + ); + + let vec_r = unsafe { + wasm32::v8x16_shuffle!(a, b, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30,) + }; + + let e = wasm32::v128_const(0_u8, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30); + assert_eq!(wasm32::i8x16_all_true(wasm32::i8x16_eq(e, vec_r)), 1); +} diff --git a/crates/std_detect/src/detect/mod.rs b/crates/std_detect/src/detect/mod.rs index 77d1f7c506..c44f44c1b3 100644 --- a/crates/std_detect/src/detect/mod.rs +++ b/crates/std_detect/src/detect/mod.rs @@ -56,6 +56,7 @@ cfg_if! { mod arch; } else { // Unimplemented architecture: + #[allow(dead_code)] mod arch { #[doc(hidden)] pub(crate) enum Feature { @@ -117,6 +118,7 @@ cfg_if! { /// Performs run-time feature detection. 
#[inline] +#[allow(dead_code)] fn check_for(x: Feature) -> bool { cache::test(x as u32, self::os::detect_features) } diff --git a/crates/std_detect/src/detect/os/other.rs b/crates/std_detect/src/detect/os/other.rs index bf7be87f07..091fafc4eb 100644 --- a/crates/std_detect/src/detect/os/other.rs +++ b/crates/std_detect/src/detect/os/other.rs @@ -2,6 +2,7 @@ use crate::detect::cache; +#[allow(dead_code)] pub(crate) fn detect_features() -> cache::Initializer { cache::Initializer::default() } diff --git a/crates/stdarch-test/Cargo.toml b/crates/stdarch-test/Cargo.toml index 2b445f8dc5..2fc42db92a 100644 --- a/crates/stdarch-test/Cargo.toml +++ b/crates/stdarch-test/Cargo.toml @@ -11,10 +11,8 @@ lazy_static = "1.0" rustc-demangle = "0.1.8" cfg-if = "0.1" -[target.wasm32-unknown-unknown.dependencies] -wasm-bindgen = "0.2.47" -js-sys = "0.3" -console_error_panic_hook = "0.1" +[target.'cfg(target_arch = "wasm32")'.dependencies] +wasmprinter = "0.2.6" [features] default = [] diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index fa73a7bba6..c66b6a8d9d 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -3,7 +3,6 @@ //! This basically just disassembles the current executable and then parses the //! output once globally and then provides the `assert` function which makes //! assertions about the disassembly of a function. -#![feature(const_transmute)] #![feature(vec_leak)] #![allow(clippy::missing_docs_in_private_items, clippy::print_stdout)] @@ -20,19 +19,8 @@ pub use assert_instr_macro::*; pub use simd_test_macro::*; use std::{cmp, collections::HashSet, env, hash, str, sync::atomic::AtomicPtr}; -// `println!` doesn't work on wasm32 right now, so shadow the compiler's `println!` -// macro with our own shim that redirects to `console.log`. -#[allow(unused)] -#[cfg(target_arch = "wasm32")] -#[macro_export] -macro_rules! println { - ($($args:tt)*) => (crate::wasm::js_console_log(&format!($($args)*))) -} - cfg_if! { if #[cfg(target_arch = "wasm32")] { - extern crate wasm_bindgen; - extern crate console_error_panic_hook; pub mod wasm; use wasm::disassemble_myself; } else { diff --git a/crates/stdarch-test/src/wasm.rs b/crates/stdarch-test/src/wasm.rs index 612ff10d90..bf411c1214 100644 --- a/crates/stdarch-test/src/wasm.rs +++ b/crates/stdarch-test/src/wasm.rs @@ -1,49 +1,17 @@ //! Disassembly calling function for `wasm32` targets. -use wasm_bindgen::prelude::*; use crate::Function; use std::collections::HashSet; -#[wasm_bindgen(module = "child_process")] -extern "C" { - #[wasm_bindgen(js_name = execFileSync)] - fn exec_file_sync(cmd: &str, args: &js_sys::Array, opts: &js_sys::Object) -> Buffer; -} - -#[wasm_bindgen(module = "buffer")] -extern "C" { - type Buffer; - #[wasm_bindgen(method, js_name = toString)] - fn to_string(this: &Buffer) -> String; -} - -#[wasm_bindgen] -extern "C" { - #[wasm_bindgen(js_namespace = require)] - fn resolve(module: &str) -> String; - #[wasm_bindgen(js_namespace = console, js_name = log)] - pub fn js_console_log(s: &str); -} - pub(crate) fn disassemble_myself() -> HashSet { - use std::path::Path; - ::console_error_panic_hook::set_once(); - // Our wasm module in the wasm-bindgen test harness is called - // "wasm-bindgen-test_bg". When running in node this is actually a shim JS - // file. Ask node where that JS file is, and then we use that with a wasm - // extension to find the wasm file itself. 
- let js_shim = resolve("wasm-bindgen-test"); - let js_shim = Path::new(&js_shim).with_file_name("wasm-bindgen-test_bg.wasm"); - - // Execute `wasm2wat` synchronously, waiting for and capturing all of its - // output. Note that we pass in a custom `maxBuffer` parameter because we're - // generating a ton of output that needs to be buffered. - let args = js_sys::Array::new(); - args.push(&js_shim.display().to_string().into()); - args.push(&"--enable-simd".into()); - let opts = js_sys::Object::new(); - js_sys::Reflect::set(&opts, &"maxBuffer".into(), &(200 * 1024 * 1024).into()).unwrap(); - let output = exec_file_sync("wasm2wat", &args, &opts).to_string(); + // Use `std::env::args` to find the path to our executable. Assume the + // environment is configured such that we can read that file. Read it and + // use the `wasmprinter` crate to transform the binary to text, then search + // the text for appropriately named functions. + let me = std::env::args() + .next() + .expect("failed to find current wasm file"); + let output = wasmprinter::print_file(&me).unwrap(); let mut ret: HashSet = HashSet::new(); let mut lines = output.lines().map(|s| s.trim()); diff --git a/examples/Cargo.toml b/examples/Cargo.toml index 6f00d46230..72599b4182 100644 --- a/examples/Cargo.toml +++ b/examples/Cargo.toml @@ -7,16 +7,14 @@ authors = [ "Gonzalo Brito Gadeschi ", ] description = "Examples of the stdarch crate." +edition = "2018" [dependencies] core_arch = { path = "../crates/core_arch" } std_detect = { path = "../crates/std_detect" } -quickcheck = "0.8" +quickcheck = "0.9" rand = "0.7" -[target.'cfg(target_arch = "wasm32")'.dependencies] -rand = { version = "0.6", features = ["wasm-bindgen"] } - [[bin]] name = "hex" path = "hex.rs" diff --git a/examples/hex.rs b/examples/hex.rs index b3d6fb0786..d9818d03e5 100644 --- a/examples/hex.rs +++ b/examples/hex.rs @@ -25,25 +25,15 @@ clippy::missing_docs_in_private_items )] -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[macro_use(is_x86_feature_detected)] -extern crate std_detect; - -extern crate core_arch; - -#[cfg(test)] -#[macro_use] -extern crate quickcheck; - use std::{ io::{self, Read}, str, }; #[cfg(target_arch = "x86")] -use core_arch::x86::*; +use {core_arch::arch::x86::*, std_detect::is_x86_feature_detected}; #[cfg(target_arch = "x86_64")] -use core_arch::x86_64::*; +use {core_arch::arch::x86_64::*, std_detect::is_x86_feature_detected}; fn main() { let mut input = Vec::new(); @@ -68,6 +58,12 @@ fn hex_encode<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { return unsafe { hex_encode_sse41(src, dst) }; } } + #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] + { + if true { + return unsafe { hex_encode_simd128(src, dst) }; + } + } hex_encode_fallback(src, dst) } @@ -157,6 +153,53 @@ unsafe fn hex_encode_sse41<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) } +#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] +unsafe fn hex_encode_simd128<'a>(mut src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { + use core_arch::arch::wasm32::*; + + let ascii_zero = i8x16_splat(b'0' as i8); + let nines = i8x16_splat(9); + let ascii_a = i8x16_splat((b'a' - 9 - 1) as i8); + let and4bits = i8x16_splat(0xf); + + let mut i = 0_isize; + while src.len() >= 16 { + let invec = v128_load(src.as_ptr() as *const _); + + let masked1 = v128_and(invec, and4bits); + let masked2 = v128_and(i8x16_shr_u(invec, 4), and4bits); + + // return 0xff corresponding to the 
elements > 9, or 0x00 otherwise + let cmpmask1 = i8x16_gt_u(masked1, nines); + let cmpmask2 = i8x16_gt_u(masked2, nines); + + // add '0' or the offset depending on the masks + let masked1 = i8x16_add(masked1, v128_bitselect(ascii_a, ascii_zero, cmpmask1)); + let masked2 = i8x16_add(masked2, v128_bitselect(ascii_a, ascii_zero, cmpmask2)); + + // Next we need to shuffle around masked{1,2} to get back to the + // original source text order. The first element (res1) we'll store uses + // all the low bytes from the 2 masks and the second element (res2) uses + // all the upper bytes. + let res1 = v8x16_shuffle!( + masked2, masked1, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23, + ); + let res2 = v8x16_shuffle!( + masked2, masked1, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31, + ); + + v128_store(dst.as_mut_ptr().offset(i * 2) as *mut _, res1); + v128_store(dst.as_mut_ptr().offset(i * 2 + 16) as *mut _, res2); + src = &src[16..]; + i += 16; + } + + let i = i as usize; + let _ = hex_encode_fallback(src, &mut dst[i * 2..]); + + Ok(str::from_utf8_unchecked(&dst[..src.len() * 2 + i * 2])) +} + fn hex_encode_fallback<'a>(src: &[u8], dst: &'a mut [u8]) -> Result<&'a str, usize> { fn hex(byte: u8) -> u8 { static TABLE: &[u8] = b"0123456789abcdef"; @@ -186,10 +229,10 @@ mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe { - if is_x86_feature_detected!("avx2") { + if self::is_x86_feature_detected!("avx2") { assert_eq!(hex_encode_avx2(input, &mut tmp()).unwrap(), output); } - if is_x86_feature_detected!("sse4.1") { + if self::is_x86_feature_detected!("sse4.1") { assert_eq!(hex_encode_sse41(input, &mut tmp()).unwrap(), output); } } @@ -236,7 +279,7 @@ mod tests { ); } - quickcheck! { + quickcheck::quickcheck! { fn encode_equals_fallback(input: Vec) -> bool { let mut space1 = vec![0; input.len() * 2]; let mut space2 = vec![0; input.len() * 2]; @@ -247,7 +290,7 @@ mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn avx_equals_fallback(input: Vec) -> bool { - if !is_x86_feature_detected!("avx2") { + if !self::is_x86_feature_detected!("avx2") { return true } let mut space1 = vec![0; input.len() * 2]; @@ -259,7 +302,7 @@ mod tests { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn sse41_equals_fallback(input: Vec) -> bool { - if !is_x86_feature_detected!("avx2") { + if !self::is_x86_feature_detected!("avx2") { return true } let mut space1 = vec![0; input.len() * 2]; @@ -328,28 +371,28 @@ mod benches { #[bench] fn small_avx2(b: &mut test::Bencher) { - if is_x86_feature_detected!("avx2") { + if self::is_x86_feature_detected!("avx2") { doit(b, SMALL_LEN, hex_encode_avx2); } } #[bench] fn small_sse41(b: &mut test::Bencher) { - if is_x86_feature_detected!("sse4.1") { + if self::is_x86_feature_detected!("sse4.1") { doit(b, SMALL_LEN, hex_encode_sse41); } } #[bench] fn large_avx2(b: &mut test::Bencher) { - if is_x86_feature_detected!("avx2") { + if self::is_x86_feature_detected!("avx2") { doit(b, LARGE_LEN, hex_encode_avx2); } } #[bench] fn large_sse41(b: &mut test::Bencher) { - if is_x86_feature_detected!("sse4.1") { + if self::is_x86_feature_detected!("sse4.1") { doit(b, LARGE_LEN, hex_encode_sse41); } } diff --git a/examples/wasm.rs b/examples/wasm.rs index 53f9c55d4e..6b92ae9b87 100644 --- a/examples/wasm.rs +++ b/examples/wasm.rs @@ -3,11 +3,9 @@ #![feature(stdsimd)] #![cfg(target_arch = "wasm32")] -extern crate core_arch; - use std::ptr; -use core_arch::wasm32::*; +use core_arch::arch::wasm32::*; static mut HEAD: *mut *mut u8 = 0 as 
_;
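
As a rough usage sketch of the new shuffle macro: the interleave pattern that
`hex_encode_simd128` uses above to restore byte order is just `v8x16_shuffle!`
with indices 0..16 selecting lanes from the first operand and 16..32 from the
second. Written standalone it looks like the function below, which zips the low
halves of two vectors. This assumes a nightly compiler and a wasm32 target
built with `-Ctarget-feature=+simd128`; `zip_low_demo` is an illustrative name,
not an intrinsic added by this patch.

#![feature(stdsimd)]

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub fn zip_low_demo() {
    use core_arch::arch::wasm32::*;

    let a = v128_const(0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    let b = v128_const(
        16_u8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    );

    // Indices below 16 pick lanes from `a`, indices 16..32 pick lanes from
    // `b`, so this interleaves the low eight lanes of each input.
    let zipped = unsafe {
        v8x16_shuffle!(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
    };

    // Each lane of the result alternates between the two sources.
    let expected = v128_const(0_u8, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
    assert_eq!(i8x16_all_true(i8x16_eq(zipped, expected)), 1);
}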