diff --git a/.travis.yml b/.travis.yml
index 2b60f12..db4ed82 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,21 +1,49 @@
 language: rust
 sudo: false
-# run builds for all the trains (and more)
-rust:
-  - 1.12.0
-  - stable
-  - beta
-  - nightly
+matrix:
+  include:
+    - rust: 1.28.0
+      env:
+        TARGET=x86_64-unknown-linux-gnu
+    - rust: stable
+      env:
+        TARGET=x86_64-unknown-linux-gnu
+    - rust: stable
+      env:
+        TARGET=i686-unknown-linux-gnu
+    - rust: beta
+      env:
+        TARGET=x86_64-unknown-linux-gnu
+    - rust: nightly
+      env:
+        TARGET=x86_64-unknown-linux-gnu
+    - rust: nightly
+      env:
+        TARGET=aarch64-unknown-linux-gnu
+        BUILD_ONLY=1
+env:
+  global:
+    - HOST=x86_64-unknown-linux-gnu
+
+addons:
+  apt:
+    packages:
+      # needed for i686-unknown-linux-gnu target
+      - gcc-multilib
+install:
+  # "rustup error: cannot re-add" without this conditional check
+  - if [[ $HOST != $TARGET ]]; then rustup target add $TARGET; fi
 
 # the main build script:
 script:
   - |
-      cargo build &&
-      cargo test &&
-      cargo test --release &&
-      cargo doc &&
-      cargo bench
+      cargo build --target=$TARGET &&
+      ([ -n "$BUILD_ONLY" ] || (
+      cargo test --target=$TARGET &&
+      cargo test --release --target=$TARGET &&
+      cargo doc --target=$TARGET &&
+      cargo bench --target=$TARGET ))
 
 branches:
   only:
diff --git a/Cargo.toml b/Cargo.toml
index 28e8a65..bf95480 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,8 +12,6 @@ description = "General matrix multiplication of f32 and f64 matrices in Rust. Su
 keywords = ["matrix", "sgemm", "dgemm"]
 
-build = "build.rs"
-
 [lib]
 bench = false
diff --git a/LICENSE-MIT b/LICENSE-MIT
index 9203baa..e2d8219 100644
--- a/LICENSE-MIT
+++ b/LICENSE-MIT
@@ -1,4 +1,4 @@
-Copyright (c) 2015
+Copyright (c) 2016 - 2018 Ulrik Sverdrup "bluss"
 
 Permission is hereby granted, free of charge, to any
 person obtaining a copy of this software and associated
diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index 2df0187..a3bd3fd 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -40,43 +40,44 @@ macro_rules! mat_mul {
     };
 }
 
-benchmark_main!(mat_mul_f32, mat_mul_f64, ref_mat_mul_f32);
+benchmark_main!(mat_mul_f32, mat_mul_f64);
 
 mat_mul!{mat_mul_f32, sgemm,
     (m004, 4, 4, 4)
-    (m005, 5, 5, 5)
     (m006, 6, 6, 6)
-    (m007, 7, 7, 7)
     (m008, 8, 8, 8)
-    (m009, 9, 9, 9)
     (m012, 12, 12, 12)
     (m016, 16, 16, 16)
     (m032, 32, 32, 32)
     (m064, 64, 64, 64)
     (m127, 127, 127, 127)
+    /*
     (m256, 256, 256, 256)
     (m512, 512, 512, 512)
     (mix16x4, 32, 4, 32)
     (mix32x2, 32, 2, 32)
     (mix97, 97, 97, 125)
     (mix128x10000x128, 128, 10000, 128)
+    */
 }
 
 mat_mul!{mat_mul_f64, dgemm,
     (m004, 4, 4, 4)
-    (m007, 7, 7, 7)
+    (m006, 6, 6, 6)
     (m008, 8, 8, 8)
     (m012, 12, 12, 12)
     (m016, 16, 16, 16)
     (m032, 32, 32, 32)
     (m064, 64, 64, 64)
     (m127, 127, 127, 127)
+    /*
     (m256, 256, 256, 256)
     (m512, 512, 512, 512)
     (mix16x4, 32, 4, 32)
     (mix32x2, 32, 2, 32)
     (mix97, 97, 97, 125)
     (mix128x10000x128, 128, 10000, 128)
+    */
 }
 
 use std::ops::{Add, Mul};
diff --git a/blas-bench/Cargo.toml b/blas-bench/Cargo.toml
new file mode 100644
index 0000000..d6ce102
--- /dev/null
+++ b/blas-bench/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "blas-bench"
+version = "0.1.0"
+authors = ["bluss"]
+publish = false
+
+license = "MIT/Apache-2.0"
+
+repository = "https://github.com/bluss/matrixmultiply/"
+documentation = ""
+
+description = "Blas benchmarks for comparison with matrixmultiply"
+
+keywords = ["matrix", "sgemm", "dgemm"]
+
+[lib]
+bench = false
+
+[[bench]]
+name = "benchmarks"
+harness = false
+
+[dependencies]
+rawpointer = "0.1"
+matrixmultiply = { path = ".." }
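+# Note: no blas-src backend feature is enabled below; the directive that
+# links the system openblas comes from this crate's build.rs instead.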
+blas = { version = "0.20", default-features = false }
+blas-src = { version = "0.2.0", default-features = false }
+
+
+[dev-dependencies]
+bencher = "0.1.2"
diff --git a/blas-bench/README.md b/blas-bench/README.md
new file mode 100644
index 0000000..806583e
--- /dev/null
+++ b/blas-bench/README.md
@@ -0,0 +1,7 @@
+
+Run BLAS benchmarks to compare with matrixmultiply.
+
+These benchmarks are set up to link against a system-installed openblas
+(see the build.rs file), because building all of openblas just to benchmark
+against it is tedious. Make sure openblas (or another library that provides
+the cblas interface) is installed, and tweak the build.rs file to suit.
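+
+For example, on a Debian-style system (package names may vary):
+
+    apt install libopenblas-dev
+    cargo bench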
diff --git a/blas-bench/benches/benchmarks.rs b/blas-bench/benches/benchmarks.rs
new file mode 100644
index 0000000..91fda89
--- /dev/null
+++ b/blas-bench/benches/benchmarks.rs
@@ -0,0 +1,81 @@
+extern crate blas_bench;
+extern crate matrixmultiply;
+pub use matrixmultiply::sgemm;
+pub use matrixmultiply::dgemm;
+
+#[macro_use]
+extern crate bencher;
+extern crate blas;
+
+use std::os::raw::c_int;
+
+
+#[allow(non_camel_case_types)]
+type blas_index = c_int; // blas index type
+
+
+// Compute GFlop/s
+// as flop / s = 2 * M * N * K / time
+
+
+benchmark_main!(blas_mat_mul_f32, blas_mat_mul_f64);
+
+macro_rules! blas_mat_mul {
+    ($modname:ident, $gemm:ident, $(($name:ident, $m:expr, $n:expr, $k:expr))+) => {
+        mod $modname {
+            use bencher::{Bencher};
+            use super::blas_index;
+            $(
+            pub fn $name(bench: &mut Bencher)
+            {
+                // column major: a is m x k, b is k x n, c is m x n
+                let a = vec![0.; $m * $k];
+                let b = vec![0.; $k * $n];
+                let mut c = vec![0.; $m * $n];
+                bench.iter(|| {
+                    unsafe {
+                        blas::$gemm(
+                            b'N',
+                            b'N',
+                            $m as blas_index, // m, rows of Op(a)
+                            $n as blas_index, // n, cols of Op(b)
+                            $k as blas_index, // k, cols of Op(a)
+                            1.,
+                            &a,
+                            $m, // lda
+                            &b,
+                            $k, // ldb
+                            0., // beta
+                            &mut c,
+                            $m, // ldc
+                        );
+                    }
+                });
+            }
+            )+
+        }
+        benchmark_group!{ $modname, $($modname::$name),+ }
+    };
+}
+
+blas_mat_mul!{blas_mat_mul_f32, sgemm,
+    (m004, 4, 4, 4)
+    (m006, 6, 6, 6)
+    (m008, 8, 8, 8)
+    (m012, 12, 12, 12)
+    (m016, 16, 16, 16)
+    (m032, 32, 32, 32)
+    (m064, 64, 64, 64)
+    (m127, 127, 127, 127)
+}
+
+blas_mat_mul!{blas_mat_mul_f64, dgemm,
+    (m004, 4, 4, 4)
+    (m006, 6, 6, 6)
+    (m008, 8, 8, 8)
+    (m012, 12, 12, 12)
+    (m016, 16, 16, 16)
+    (m032, 32, 32, 32)
+    (m064, 64, 64, 64)
+    (m127, 127, 127, 127)
+}
diff --git a/blas-bench/build.rs b/blas-bench/build.rs
new file mode 100644
index 0000000..9bd617a
--- /dev/null
+++ b/blas-bench/build.rs
@@ -0,0 +1,12 @@
+
+///
+/// This build script emits the directive to link against openblas
+///
+
+fn main() {
+    // Always link openblas. Compiling blas just for testing is tedious --
+    // install it on your system and run this.
+    println!("cargo:rerun-if-changed=build.rs");
+    println!("cargo:rustc-link-lib={}=openblas", "dylib");
+}
diff --git a/blas-bench/src/lib.rs b/blas-bench/src/lib.rs
new file mode 100644
index 0000000..31e1bb2
--- /dev/null
+++ b/blas-bench/src/lib.rs
@@ -0,0 +1,7 @@
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn it_works() {
+        assert_eq!(2 + 2, 4);
+    }
+}
diff --git a/build.rs b/build.rs
deleted file mode 100644
index 32f833c..0000000
--- a/build.rs
+++ /dev/null
@@ -1,14 +0,0 @@
-
-//!
-
-use std::env;
-
-fn main() {
-    println!("cargo:rerun-if-changed=build.rs");
-
-    if let Ok(features) = env::var("CARGO_CFG_TARGET_FEATURE") {
-        if features.split(",").map(|s| s.trim()).any(|feat| feat == "avx") {
-            println!("cargo:rustc-cfg=sgemm_8x8");
-        }
-    }
-}
diff --git a/spare kernels/x86_sse_sgemm.rs b/spare kernels/x86_sse_sgemm.rs
new file mode 100644
index 0000000..720c93c
--- /dev/null
+++ b/spare kernels/x86_sse_sgemm.rs
@@ -0,0 +1,84 @@
+
+// 4x4 sse sgemm
+macro_rules! mm_transpose4 {
+    ($c0:expr, $c1:expr, $c2:expr, $c3:expr) => {{
+        // This is _MM_TRANSPOSE4_PS except we take variables, not references
+        let tmp0 = _mm_unpacklo_ps($c0, $c1);
+        let tmp2 = _mm_unpacklo_ps($c2, $c3);
+        let tmp1 = _mm_unpackhi_ps($c0, $c1);
+        let tmp3 = _mm_unpackhi_ps($c2, $c3);
+
+        $c0 = _mm_movelh_ps(tmp0, tmp2);
+        $c1 = _mm_movehl_ps(tmp2, tmp0);
+        $c2 = _mm_movelh_ps(tmp1, tmp3);
+        $c3 = _mm_movehl_ps(tmp3, tmp1);
+    }}
+}
+
+#[inline(always)]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+unsafe fn kernel_x86_sse(k: usize, alpha: T, a: *const T, b: *const T,
+                         beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    let mut ab = [_mm_setzero_ps(); MR];
+
+    let mut bv;
+    let (mut a, mut b) = (a, b);
+
+    // Compute A B
+    for _ in 0..k {
+        bv = _mm_load_ps(b as _); // aligned due to GemmKernel::align_to
+
+        loop_m!(i, {
+            // Compute ab_i += [ai b_j+0, ai b_j+1, ai b_j+2, ai b_j+3]
+            let aiv = _mm_set1_ps(at(a, i));
+            ab[i] = _mm_add_ps(ab[i], _mm_mul_ps(aiv, bv));
+        });
+
+        a = a.add(MR);
+        b = b.add(NR);
+    }
+
+    // Compute α (A B)
+    let alphav = _mm_set1_ps(alpha);
+    loop_m!(i, ab[i] = _mm_mul_ps(alphav, ab[i]));
+
+    macro_rules! c {
+        ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize));
+    }
+
+    // C ← α A B + β C
+    let mut c = [_mm_setzero_ps(); MR];
+    let betav = _mm_set1_ps(beta);
+    if beta != 0. {
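+        // beta == 0. means C may be uninitialized on entry and must not be
+        // read; in that case we skip the load and just overwrite below.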
+        // Read C
+        if csc == 1 {
+            loop_m!(i, c[i] = _mm_loadu_ps(c![i, 0]));
+        } else if rsc == 1 {
+            loop_m!(i, c[i] = _mm_loadu_ps(c![0, i]));
+            mm_transpose4!(c[0], c[1], c[2], c[3]);
+        } else {
+            loop_m!(i, c[i] = _mm_set_ps(*c![i, 3], *c![i, 2], *c![i, 1], *c![i, 0]));
+        }
+        // Compute β C
+        loop_m!(i, c[i] = _mm_mul_ps(c[i], betav));
+    }
+
+    // Compute (α A B) + (β C)
+    loop_m!(i, c[i] = _mm_add_ps(c[i], ab[i]));
+
+    // Store C back to memory
+    if csc == 1 {
+        loop_m!(i, _mm_storeu_ps(c![i, 0], c[i]));
+    } else if rsc == 1 {
+        mm_transpose4!(c[0], c[1], c[2], c[3]);
+        loop_m!(i, _mm_storeu_ps(c![0, i], c[i]));
+    } else {
+        // extract the nth value of a vector using _mm_cvtss_f32 (extract lowest)
+        // in combination with shuffle (move nth value to first position)
+        loop_m!(i, *c![i, 0] = _mm_cvtss_f32(c[i]));
+        loop_m!(i, *c![i, 1] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 1)));
+        loop_m!(i, *c![i, 2] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 2)));
+        loop_m!(i, *c![i, 3] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 3)));
+    }
+}
diff --git a/src/aligned_alloc.rs b/src/aligned_alloc.rs
new file mode 100644
index 0000000..f0f7392
--- /dev/null
+++ b/src/aligned_alloc.rs
@@ -0,0 +1,73 @@
+
+use std::alloc;
+use std::alloc::{Layout, handle_alloc_error};
+use std::{mem, cmp};
+
+#[cfg(test)]
+use std::ops::{Deref, DerefMut};
+#[cfg(test)]
+use std::slice;
+
+pub(crate) struct Alloc<T> { ptr: *mut T, len: usize, align: usize }
+
+impl<T> Alloc<T> {
+    #[inline]
+    pub unsafe fn new(nelem: usize, align: usize) -> Self {
+        let align = cmp::max(align, mem::align_of::<T>());
+        #[cfg(debug_assertions)]
+        let layout = Layout::from_size_align(mem::size_of::<T>() * nelem, align).unwrap();
+        #[cfg(not(debug_assertions))]
+        let layout = Layout::from_size_align_unchecked(mem::size_of::<T>() * nelem, align);
+        dprint!("Allocating nelem={}, layout={:?}", nelem, layout);
+        let ptr = alloc::alloc(layout);
+        if ptr.is_null() {
+            handle_alloc_error(layout);
+        }
+        Alloc {
+            ptr: ptr as *mut T,
+            len: nelem,
+            align,
+        }
+    }
+
+    #[cfg(test)]
+    pub fn init_with(mut self, elt: T) -> Alloc<T> where T: Copy
+    {
+        for elt1 in &mut self[..] {
+            *elt1 = elt;
+        }
+        self
+    }
+
+    #[inline]
+    pub fn ptr_mut(&mut self) -> *mut T { self.ptr }
+}
+
+impl<T> Drop for Alloc<T> {
+    fn drop(&mut self) {
+        unsafe {
+            let layout = Layout::from_size_align_unchecked(mem::size_of::<T>() * self.len, self.align);
+            alloc::dealloc(self.ptr as _, layout);
+        }
+    }
+}
+
+#[cfg(test)]
+impl<T> Deref for Alloc<T> {
+    type Target = [T];
+    fn deref(&self) -> &[T] {
+        unsafe {
+            slice::from_raw_parts(self.ptr, self.len)
+        }
+    }
+}
+
+#[cfg(test)]
+impl<T> DerefMut for Alloc<T> {
+    fn deref_mut(&mut self) -> &mut [T] {
+        unsafe {
+            slice::from_raw_parts_mut(self.ptr, self.len)
+        }
+    }
+}
diff --git a/src/archmacros_x86.rs b/src/archmacros_x86.rs
new file mode 100644
index 0000000..5ef1e23
--- /dev/null
+++ b/src/archmacros_x86.rs
@@ -0,0 +1,16 @@
+#![cfg(any(target_arch="x86", target_arch="x86_64"))]
+
+macro_rules! compile_env_enabled {
+    ($($name:tt)*) => {
+        !option_env!($($name)*).unwrap_or("").is_empty()
+    }
+}
+
+macro_rules! is_x86_feature_detected_ {
+    ($name:tt) => {
+        // for testing purposes, we can disable a feature at compile time by
+        // setting MMNO_avx=1 etc.
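+        // e.g. building with `MMNO_avx=1 cargo bench` makes dispatch skip
+        // the avx kernel and fall through to sse2 / the fallback.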
+        !compile_env_enabled!(concat!("MMNO_", $name)) && is_x86_feature_detected!($name)
+    }
+}
diff --git a/src/archparam.rs b/src/archparam.rs
index ed31432..d1cf9e8 100644
--- a/src/archparam.rs
+++ b/src/archparam.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
diff --git a/src/debugmacros.rs b/src/debugmacros.rs
index 5494fb1..2ce62ff 100644
--- a/src/debugmacros.rs
+++ b/src/debugmacros.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
diff --git a/src/dgemm_kernel.rs b/src/dgemm_kernel.rs
index de79688..e4ab4d8 100644
--- a/src/dgemm_kernel.rs
+++ b/src/dgemm_kernel.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -72,12 +72,44 @@ impl GemmKernel for Gemm {
 pub unsafe fn kernel(k: usize, alpha: T, a: *const T, b: *const T,
                      beta: T, c: *mut T, rsc: isize, csc: isize)
 {
-    // using `uninitialized` is a workaround for issue https://github.com/bluss/matrixmultiply/issues/9
-    let mut ab: [[T; NR]; MR] = ::std::mem::uninitialized();
+    // dispatch to specific compiled versions
+    #[cfg(any(target_arch="x86", target_arch="x86_64"))]
+    {
+        if is_x86_feature_detected_!("avx") {
+            return kernel_target_avx(k, alpha, a, b, beta, c, rsc, csc);
+        } else if is_x86_feature_detected_!("sse2") {
+            return kernel_target_sse2(k, alpha, a, b, beta, c, rsc, csc);
+        }
+    }
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc);
+}
+
+#[inline]
+#[target_feature(enable="avx")]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+pub unsafe fn kernel_target_avx(k: usize, alpha: T, a: *const T, b: *const T,
+                                beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc)
+}
+
+#[inline]
+#[target_feature(enable="sse2")]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+pub unsafe fn kernel_target_sse2(k: usize, alpha: T, a: *const T, b: *const T,
+                                 beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc)
+}
+
+#[inline(always)]
+pub unsafe fn kernel_fallback_impl(k: usize, alpha: T, a: *const T, b: *const T,
+                                   beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    let mut ab: [[T; NR]; MR] = [[0.; NR]; MR];
     let mut a = a;
     let mut b = b;
     debug_assert_eq!(beta, 0.); // always masked
-    loop_m!(i, loop_n!(j, ab[i][j] = 0.));
 
     // Compute matrix multiplication into ab[i][j]
     unroll_by!(4 => k, {
@@ -100,22 +132,82 @@
 unsafe fn at(ptr: *const T, i: usize) -> T {
     *ptr.offset(i as isize)
 }
 
-#[test]
-fn test_gemm_kernel() {
-    let mut a = [1.; 32];
-    let mut b = [0.; 16];
-    for (i, x) in a.iter_mut().enumerate() {
-        *x = i as f64;
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use aligned_alloc::Alloc;
+
+    fn aligned_alloc<T>(elt: T, n: usize) -> Alloc<T> where T: Copy
+    {
+        unsafe {
+            Alloc::new(n, Gemm::align_to()).init_with(elt)
+        }
     }
-    for i in 0..4 {
-        b[i + i * 4] = 1.;
+
+    use super::T;
+    type KernelFn = unsafe fn(usize, T, *const T, *const T, T, *mut T, isize, isize);
+
+    fn test_a_kernel(_name: &str, kernel_fn: KernelFn) {
+        const K: usize = 4;
+        let mut a = aligned_alloc(1., MR * K);
+        let mut b = aligned_alloc(0., NR * K);
+        for (i, x) in a.iter_mut().enumerate() {
+            *x = i as _;
+        }
+
+        for i in 0..K {
+            b[i + i * NR] = 1.;
+        }
+        let mut c = [0.; MR * NR];
+        unsafe {
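+            // b is a K x NR identity block, so C = A B reproduces A,
+            // read back in column major order (rsc = 1, csc = MR)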
+            kernel_fn(K, 1., &a[0], &b[0], 0., &mut c[0], 1, MR as isize);
+            // col major C
+        }
+        assert_eq!(&a[..], &c[..a.len()]);
     }
-    let mut c = [0.; 32];
-    unsafe {
-        kernel(4, 1., &a[0], &b[0],
-               0., &mut c[0], 1, 8);
-        // transposed C so that results line up
+
+    #[test]
+    fn test_native_kernel() {
+        test_a_kernel("kernel", kernel);
+    }
+
+    #[test]
+    fn test_kernel_fallback_impl() {
+        test_a_kernel("kernel", kernel_fallback_impl);
+    }
+
+    #[test]
+    fn test_loop_m_n() {
+        let mut m = [[0; NR]; MR];
+        loop_m!(i, loop_n!(j, m[i][j] += 1));
+        for arr in &m[..] {
+            for elt in &arr[..] {
+                assert_eq!(*elt, 1);
+            }
+        }
     }
-    assert_eq!(&a, &c);
-}
+
+    mod test_arch_kernels {
+        use super::test_a_kernel;
+        macro_rules! test_arch_kernels_x86 {
+            ($($feature_name:tt, $function_name:ident),*) => {
+                $(
+                #[test]
+                fn $function_name() {
+                    if is_x86_feature_detected_!($feature_name) {
+                        test_a_kernel(stringify!($function_name), super::super::$function_name);
+                    } else {
+                        println!("Skipping, host does not have feature: {:?}", $feature_name);
+                    }
+                }
+                )*
+            }
+        }
+
+        #[cfg(any(target_arch="x86", target_arch="x86_64"))]
+        test_arch_kernels_x86! {
+            "avx", kernel_target_avx,
+            "sse2", kernel_target_sse2
+        }
+    }
+}
diff --git a/src/gemm.rs b/src/gemm.rs
index 64dfb1f..b3e7cbf 100644
--- a/src/gemm.rs
+++ b/src/gemm.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -9,6 +9,8 @@
 use std::cmp::min;
 use std::mem::size_of;
 
+use aligned_alloc::Alloc;
+
 use util::range_chunk;
 use util::round_up_to;
@@ -136,9 +138,9 @@ unsafe fn gemm_loop<K>(
     let kmc = K::mc();
     ensure_kernel_params::<K>();
 
-    let (mut packv, bp_offset) = packing_vec::<K>(m, k, n);
-    let app = make_aligned_vec_ptr(K::align_to(), &mut packv);
-    let bpp = app.offset(bp_offset);
+    let (mut packing_buffer, bp_offset) = make_packing_buffer::<K>(m, k, n);
+    let app = packing_buffer.ptr_mut();
+    let bpp = app.add(bp_offset);
 
     // LOOP 5: split n into nc parts
     for (l5, nc) in range_chunk(n, knc) {
@@ -151,7 +153,6 @@
             dprint!("LOOP 4, {}, kc={}", l4, kc);
             let b = b.stride_offset(rsb, kkc * l4);
             let a = a.stride_offset(csa, kkc * l4);
-            debug!(for elt in &mut packv { *elt = <_>::one(); });
 
             // Pack B -> B~
             pack(kc, nc, K::nr(), bpp, b, csb, rsb);
@@ -232,10 +233,12 @@ unsafe fn gemm_packed(nc: usize, kc: usize, mc: usize,
 /// but we can make them smaller if the matrix is smaller than this (just ensure
 /// we have rounded up to a multiple of the kernel size).
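+/// (Example: for f32 with the 8x8 kernel, m = k = n = 64 gives
+/// apack_size = bpack_size = 64 * 64, i.e. bp_offset = 4096 elements.)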
 ///
-/// Return packing vector and offset to start of b
+/// Return packing buffer and offset to start of b
-unsafe fn packing_vec<K>(m: usize, k: usize, n: usize) -> (Vec<K::Elem>, isize)
+unsafe fn make_packing_buffer<K>(m: usize, k: usize, n: usize) -> (Alloc<K::Elem>, usize)
     where K: GemmKernel,
 {
+    // max alignment requirement is a multiple of min(MR, NR) * sizeof
+    // because apack_size is a multiple of MR, start of b aligns fine
     let m = min(m, K::mc());
     let k = min(k, K::kc());
     let n = min(n, K::nc());
@@ -244,29 +247,13 @@
     let apack_size = k * round_up_to(m, K::mr());
     let bpack_size = k * round_up_to(n, K::nr());
     let nelem = apack_size + bpack_size;
-    let mut v = Vec::with_capacity(nelem);
-    v.set_len(nelem);
+    dprint!("packed nelem={}, apack={}, bpack={}, m={} k={} n={}",
+            nelem, apack_size, bpack_size, m, k, n);
-    // max alignment requirement is a multiple of min(MR, NR) * sizeof
-    // because apack_size is a multiple of MR, start of b aligns fine
-    (v, apack_size as isize)
-}
 
-/// Align a pointer into the vec. Will reallocate to fit & shift the pointer
-/// forwards if needed. This invalidates any previous pointers into the v.
-unsafe fn make_aligned_vec_ptr<U>(align_to: usize, v: &mut Vec<U>) -> *mut U {
-    let mut ptr = v.as_mut_ptr();
-    if align_to != 0 {
-        if v.as_ptr() as usize % align_to != 0 {
-            let cap = v.capacity();
-            v.reserve_exact(cap + align_to / size_of::<U>() - 1);
-            ptr = align_ptr(align_to, v.as_mut_ptr());
-        }
-    }
-    ptr
+    (Alloc::new(nelem, K::align_to()), apack_size)
 }
 
 /// offset the ptr forwards to align to a specific byte count
diff --git a/src/kernel.rs b/src/kernel.rs
index 9a891a6..1d59dcd 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
diff --git a/src/lib.rs b/src/lib.rs
index 71e5afb..b87cfc1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -52,6 +52,7 @@
 
 extern crate rawpointer;
 
+#[macro_use] mod archmacros_x86;
 #[macro_use] mod debugmacros;
 #[macro_use] mod loopmacros;
 mod archparam;
@@ -60,6 +61,7 @@
 mod gemm;
 mod sgemm_kernel;
 mod dgemm_kernel;
 mod util;
+mod aligned_alloc;
 
 pub use gemm::sgemm;
 pub use gemm::dgemm;
diff --git a/src/loopmacros.rs b/src/loopmacros.rs
index c13e0b1..8e40d81 100644
--- a/src/loopmacros.rs
+++ b/src/loopmacros.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -6,16 +6,6 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-macro_rules! if_cfg {
-    ($id:ident, $type_:ty, $x:expr, $y:expr) => {{
-        #[cfg($id)]
-        const TMP: $type_ = $x;
-        #[cfg(not($id))]
-        const TMP: $type_ = $y;
-        TMP
-    }}
-}
-
 // Unroll only in non-debug builds
 
 #[cfg(not(debug_assertions))]
@@ -79,3 +69,34 @@ macro_rules! unroll_by {
         }
     }}
 }
+
+#[cfg(debug_assertions)]
+macro_rules! unroll_by_with_last {
+    ($by:tt => $ntimes:expr, $is_last:ident, $e:expr) => {{
+        let k = $ntimes - 1;
+        let $is_last = false;
+        for _ in 0..k {
+            $e;
+        }
+        let $is_last = true;
+        #[allow(unused_assignments)]
+        $e;
+    }}
+}
+
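+// Like unroll_by!, but binds $is_last = true for the final expansion of $e
+// so the body can skip work (such as loading data for a next iteration)
+// that only makes sense when another iteration follows.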
+#[cfg(not(debug_assertions))]
+macro_rules! unroll_by_with_last {
+    ($by:tt => $ntimes:expr, $is_last:ident, $e:expr) => {{
+        let k = $ntimes - 1;
+        let $is_last = false;
+        for _ in 0..k / $by {
+            repeat!($by $e);
+        }
+        for _ in 0..k % $by {
+            $e;
+        }
+        let $is_last = true;
+        #[allow(unused_assignments)]
+        $e;
+    }}
+}
diff --git a/src/sgemm_kernel.rs b/src/sgemm_kernel.rs
index e223632..be0eefb 100644
--- a/src/sgemm_kernel.rs
+++ b/src/sgemm_kernel.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -9,25 +9,27 @@
 use kernel::GemmKernel;
 use archparam;
 
+
+#[cfg(target_arch="x86")]
+use std::arch::x86::*;
+#[cfg(target_arch="x86_64")]
+use std::arch::x86_64::*;
+
 pub enum Gemm { }
 
 pub type T = f32;
 
-const MR: usize = if_cfg!(sgemm_8x8, usize, 8, 4);
+const MR: usize = 8;
 const NR: usize = 8;
 
-#[cfg(sgemm_8x8)]
 macro_rules! loop_m { ($i:ident, $e:expr) => { loop8!($i, $e) }; }
-#[cfg(not(sgemm_8x8))]
-macro_rules! loop_m { ($i:ident, $e:expr) => { loop4!($i, $e) }; }
-
 macro_rules! loop_n { ($j:ident, $e:expr) => { loop8!($j, $e) }; }
 
 impl GemmKernel for Gemm {
     type Elem = T;
 
     #[inline(always)]
-    fn align_to() -> usize { 0 }
+    fn align_to() -> usize { 32 }
 
     #[inline(always)]
     fn mr() -> usize { MR }
@@ -35,7 +37,7 @@ impl GemmKernel for Gemm {
     fn nr() -> usize { NR }
 
     #[inline(always)]
-    fn always_masked() -> bool { true }
+    fn always_masked() -> bool { false }
 
     #[inline(always)]
     fn nc() -> usize { archparam::S_NC }
@@ -68,19 +70,296 @@
 /// + rsc: row stride of c
 /// + csc: col stride of c
 /// + if beta is 0, then c does not need to be initialized
-#[inline(always)]
+#[inline(never)]
 pub unsafe fn kernel(k: usize, alpha: T, a: *const T, b: *const T,
                      beta: T, c: *mut T, rsc: isize, csc: isize)
 {
-    // using `uninitialized` is a workaround for issue https://github.com/bluss/matrixmultiply/issues/9
-    let mut ab: [[T; NR]; MR] = ::std::mem::uninitialized();
+    // dispatch to specific compiled versions
+    #[cfg(any(target_arch="x86", target_arch="x86_64"))]
+    {
+        if is_x86_feature_detected_!("avx") {
+            return kernel_target_avx(k, alpha, a, b, beta, c, rsc, csc);
+        } else if is_x86_feature_detected_!("sse2") {
+            return kernel_target_sse2(k, alpha, a, b, beta, c, rsc, csc);
+        }
+    }
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc);
+}
+
+#[inline]
+#[target_feature(enable="avx")]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+unsafe fn kernel_target_avx(k: usize, alpha: T, a: *const T, b: *const T,
+                            beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    kernel_x86_avx(k, alpha, a, b, beta, c, rsc, csc)
+}
+
+#[inline]
+#[target_feature(enable="sse2")]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+unsafe fn kernel_target_sse2(k: usize, alpha: T, a: *const T, b: *const T,
+                             beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc)
+}
+
+#[inline(always)]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+unsafe fn kernel_x86_avx(k: usize, alpha: T, a: *const T, b: *const T,
+                         beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    debug_assert_ne!(k, 0);
+
+    let mut ab = [_mm256_setzero_ps(); MR];
+
+    // this kernel can operate in either transposition (C = A B or C^T = B^T A^T)
+    const PREFER_ROW_MAJOR_C: bool = true;
+
+    let (mut a, mut b) = if PREFER_ROW_MAJOR_C { (a, b) } else { (b, a) };
+    let (rsc, csc) = if PREFER_ROW_MAJOR_C { (rsc, csc) } else { (csc, rsc) };
+
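+    // Swapping (a, b) together with (rsc, csc) computes the transposed
+    // product C^T = B^T A^T, so the code below only has to handle its
+    // preferred output layout.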
+    macro_rules! shuffle_mask {
+        ($z:expr, $y:expr, $x:expr, $w:expr) => {
+            ($z << 6) | ($y << 4) | ($x << 2) | $w
+        }
+    }
+    macro_rules! permute_mask {
+        ($z:expr, $y:expr, $x:expr, $w:expr) => {
+            ($z << 6) | ($y << 4) | ($x << 2) | $w
+        }
+    }
+
+    macro_rules! permute2f128_mask {
+        ($y:expr, $x:expr) => {
+            (($y << 4) | $x)
+        }
+    }
+
+    // Start data load before each iteration
+    let mut av = _mm256_load_ps(a);
+    let mut bv = _mm256_load_ps(b);
+
+    // Compute A B
+    unroll_by_with_last!(4 => k, is_last, {
+        // We compute abij = ai bj
+        //
+        // Load b as one contiguous vector
+        // Load a as striped vectors
+        //
+        // Shuffle the abij elements in order after the loop.
+        //
+        // Note this scheme copied and transposed from the BLIS 8x8 sgemm
+        // microkernel.
+        //
+        // Our a indices are striped and our b indices are linear. In
+        // the variable names below, we always have doubled indices so
+        // for example a0246 corresponds to a vector of a0 a0 a2 a2 a4 a4 a6 a6.
+        //
+        // ab0246: ab2064: ab4602: ab6420:
+        // ( ab00  ( ab20  ( ab40  ( ab60
+        //   ab01    ab21    ab41    ab61
+        //   ab22    ab02    ab62    ab42
+        //   ab23    ab03    ab63    ab43
+        //   ab44    ab64    ab04    ab24
+        //   ab45    ab65    ab05    ab25
+        //   ab66    ab46    ab26    ab06
+        //   ab67 )  ab47 )  ab27 )  ab07 )
+        //
+        // ab1357: ab3175: ab5713: ab7531:
+        // ( ab10  ( ab30  ( ab50  ( ab70
+        //   ab11    ab31    ab51    ab71
+        //   ab32    ab12    ab72    ab52
+        //   ab33    ab13    ab73    ab53
+        //   ab54    ab74    ab14    ab34
+        //   ab55    ab75    ab15    ab35
+        //   ab76    ab56    ab36    ab16
+        //   ab77 )  ab57 )  ab37 )  ab17 )
+
+        const PERM32_2301: i32 = permute_mask!(1, 0, 3, 2);
+        const PERM128_30: i32 = permute2f128_mask!(0, 3);
+
+        // _mm256_moveldup_ps(av):
+        // vmovsldup ymm2, ymmword ptr [rax]
+        //
+        // Load and duplicate each even word:
+        // ymm2 ← [a0 a0 a2 a2 a4 a4 a6 a6]
+        //
+        // _mm256_movehdup_ps(av):
+        // vmovshdup ymm2, ymmword ptr [rax]
+        //
+        // Load and duplicate each odd word:
+        // ymm2 ← [a1 a1 a3 a3 a5 a5 a7 a7]
+        //
+
+        let a0246 = _mm256_moveldup_ps(av); // Load: a0 a0 a2 a2 a4 a4 a6 a6
+        let a2064 = _mm256_permute_ps(a0246, PERM32_2301);
+
+        let a1357 = _mm256_movehdup_ps(av); // Load: a1 a1 a3 a3 a5 a5 a7 a7
+        let a3175 = _mm256_permute_ps(a1357, PERM32_2301);
+
+        let a4602 = _mm256_permute2f128_ps(a0246, a0246, PERM128_30);
+        let a6420 = _mm256_permute2f128_ps(a2064, a2064, PERM128_30);
+
+        let a5713 = _mm256_permute2f128_ps(a1357, a1357, PERM128_30);
+        let a7531 = _mm256_permute2f128_ps(a3175, a3175, PERM128_30);
+
+        ab[0] = _mm256_add_ps(ab[0], _mm256_mul_ps(a0246, bv));
+        ab[1] = _mm256_add_ps(ab[1], _mm256_mul_ps(a2064, bv));
+        ab[2] = _mm256_add_ps(ab[2], _mm256_mul_ps(a4602, bv));
+        ab[3] = _mm256_add_ps(ab[3], _mm256_mul_ps(a6420, bv));
+
+        ab[4] = _mm256_add_ps(ab[4], _mm256_mul_ps(a1357, bv));
+        ab[5] = _mm256_add_ps(ab[5], _mm256_mul_ps(a3175, bv));
+        ab[6] = _mm256_add_ps(ab[6], _mm256_mul_ps(a5713, bv));
+        ab[7] = _mm256_add_ps(ab[7], _mm256_mul_ps(a7531, bv));
+
+        if !is_last {
+            a = a.add(MR);
+            b = b.add(NR);
+
+            bv = _mm256_load_ps(b);
+            av = _mm256_load_ps(a);
+        }
+    });
+
+    let alphav = _mm256_set1_ps(alpha);
+
+    // Permute to put the abij elements in order
+    //
+    // shufps 0xe4: 22006644 00224466 -> 22226666
+    //
+    // vperm2 0x30: 00004444 44440000 -> 00000000
+    // vperm2 0x12: 00004444 44440000 -> 44444444
+    //
+
+    let ab0246 = ab[0];
+    let ab2064 = ab[1];
+    let ab4602 = ab[2];
+    let ab6420 = ab[3];
+
+    let ab1357 = ab[4];
+    let ab3175 = ab[5];
+    let ab5713 = ab[6];
+    let ab7531 = ab[7];
+
+    const SHUF_0123: i32 = shuffle_mask!(3, 2, 1, 0);
+    debug_assert_eq!(SHUF_0123, 0xE4);
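+    // shuffle_mask!(3, 2, 1, 0) packs each lane index into a 2-bit field:
+    // (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0b11_10_01_00 == 0xE4,
+    // i.e. every lane keeps its own position (the identity shuffle).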
+    const PERM128_03: i32 = permute2f128_mask!(3, 0);
+    const PERM128_21: i32 = permute2f128_mask!(1, 2);
+
+    // No elements are "shuffled" in truth, they all stay at their index
+    // but we combine vectors to de-stripe them.
+    //
+    // For example, the first shuffle below uses 0 1 2 3 which
+    // corresponds to the X0 X1 Y2 Y3 sequence etc:
+    //
+    //                                             variable
+    // X ab00 ab01 ab22 ab23 ab44 ab45 ab66 ab67   ab0246
+    // Y ab20 ab21 ab02 ab03 ab64 ab65 ab46 ab47   ab2064
+    //
+    //   X0 X1 Y2 Y3 X4 X5 Y6 Y7
+    // = ab00 ab01 ab02 ab03 ab44 ab45 ab46 ab47   ab0044
+
+    let ab0044 = _mm256_shuffle_ps(ab0246, ab2064, SHUF_0123);
+    let ab2266 = _mm256_shuffle_ps(ab2064, ab0246, SHUF_0123);
+
+    let ab4400 = _mm256_shuffle_ps(ab4602, ab6420, SHUF_0123);
+    let ab6622 = _mm256_shuffle_ps(ab6420, ab4602, SHUF_0123);
+
+    let ab1155 = _mm256_shuffle_ps(ab1357, ab3175, SHUF_0123);
+    let ab3377 = _mm256_shuffle_ps(ab3175, ab1357, SHUF_0123);
+
+    let ab5511 = _mm256_shuffle_ps(ab5713, ab7531, SHUF_0123);
+    let ab7733 = _mm256_shuffle_ps(ab7531, ab5713, SHUF_0123);
+
+    let ab0000 = _mm256_permute2f128_ps(ab0044, ab4400, PERM128_03);
+    let ab4444 = _mm256_permute2f128_ps(ab0044, ab4400, PERM128_21);
+
+    let ab2222 = _mm256_permute2f128_ps(ab2266, ab6622, PERM128_03);
+    let ab6666 = _mm256_permute2f128_ps(ab2266, ab6622, PERM128_21);
+
+    let ab1111 = _mm256_permute2f128_ps(ab1155, ab5511, PERM128_03);
+    let ab5555 = _mm256_permute2f128_ps(ab1155, ab5511, PERM128_21);
+
+    let ab3333 = _mm256_permute2f128_ps(ab3377, ab7733, PERM128_03);
+    let ab7777 = _mm256_permute2f128_ps(ab3377, ab7733, PERM128_21);
+
+    ab[0] = ab0000;
+    ab[1] = ab1111;
+    ab[2] = ab2222;
+    ab[3] = ab3333;
+    ab[4] = ab4444;
+    ab[5] = ab5555;
+    ab[6] = ab6666;
+    ab[7] = ab7777;
+
+    // Compute α (A B)
+    loop_m!(i, ab[i] = _mm256_mul_ps(alphav, ab[i]));
+
+    macro_rules! c {
+        ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize));
+    }
+
+    // C ← α A B + β C
+    let mut c = [_mm256_setzero_ps(); MR];
+    let betav = _mm256_set1_ps(beta);
+    if beta != 0. {
+        // Read C
+        if csc == 1 {
+            loop_m!(i, c[i] = _mm256_loadu_ps(c![i, 0]));
+            // Handle rsc == 1 case with transpose?
+        } else {
+            loop_m!(i, c[i] = _mm256_set_ps(*c![i, 7], *c![i, 6], *c![i, 5], *c![i, 4],
+                                            *c![i, 3], *c![i, 2], *c![i, 1], *c![i, 0]));
+        }
+        // Compute β C
+        loop_m!(i, c[i] = _mm256_mul_ps(c[i], betav));
+    }
+
+    // Compute (α A B) + (β C)
+    loop_m!(i, c[i] = _mm256_add_ps(c[i], ab[i]));
+
+    // Store C back to memory
+    if csc == 1 {
+        loop_m!(i, _mm256_storeu_ps(c![i, 0], c[i]));
+        // Handle rsc == 1 case with transpose?
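+        // (an 8x8 in-register transpose, analogous to mm_transpose4 in the
+        // sse kernel, could enable vector stores for column major C too)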
+    } else {
+        // Permute to bring each element in the vector to the front and store
+        loop_m!(i, {
+            let clo = _mm256_extractf128_ps(c[i], 0);
+            let chi = _mm256_extractf128_ps(c[i], 1);
+
+            _mm_store_ss(c![i, 0], clo);
+            let cperm = _mm_permute_ps(clo, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 1], cperm);
+            let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 2], cperm);
+            let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 3], cperm);
+
+            _mm_store_ss(c![i, 4], chi);
+            let cperm = _mm_permute_ps(chi, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 5], cperm);
+            let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 6], cperm);
+            let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 7], cperm);
+        });
+    }
+}
+
+#[inline]
+unsafe fn kernel_fallback_impl(k: usize, alpha: T, a: *const T, b: *const T,
+                               beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    let mut ab: [[T; NR]; MR] = [[0.; NR]; MR];
     let mut a = a;
     let mut b = b;
-    debug_assert_eq!(beta, 0.); // always masked
-    loop_m!(i, loop_n!(j, ab[i][j] = 0.));
 
-    // Compute matrix multiplication into ab[i][j]
-    unroll_by!(5 => k, {
+    // Compute A B into ab[i][j]
+    unroll_by!(4 => k, {
         loop_m!(i, loop_n!(j, ab[i][j] += at(a, i) * at(b, j)));
 
         a = a.offset(MR as isize);
@@ -91,8 +370,12 @@
         ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize));
     }
 
-    // set C = α A B
-    loop_n!(j, loop_m!(i, *c![i, j] = alpha * ab[i][j]));
+    // set C = α A B + β C
+    if beta == 0. {
+        loop_n!(j, loop_m!(i, *c![i, j] = alpha * ab[i][j]));
+    } else {
+        loop_n!(j, loop_m!(i, *c![i, j] = *c![i, j] * beta + alpha * ab[i][j]));
+    }
 }
 
 #[inline(always)]
@@ -100,22 +383,82 @@
 unsafe fn at(ptr: *const T, i: usize) -> T {
     *ptr.offset(i as isize)
 }
 
-#[test]
-fn test_gemm_kernel() {
-    let mut a = [1.; 16];
-    let mut b = [0.; 32];
-    for (i, x) in a.iter_mut().enumerate() {
-        *x = i as f32;
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use aligned_alloc::Alloc;
+
+    fn aligned_alloc<T>(elt: T, n: usize) -> Alloc<T> where T: Copy
+    {
+        unsafe {
+            Alloc::new(n, Gemm::align_to()).init_with(elt)
+        }
+    }
+
+    use super::T;
+    type KernelFn = unsafe fn(usize, T, *const T, *const T, T, *mut T, isize, isize);
+
+    fn test_a_kernel(_name: &str, kernel_fn: KernelFn) {
+        const K: usize = 4;
+        let mut a = aligned_alloc(1., MR * K);
+        let mut b = aligned_alloc(0., NR * K);
+        for (i, x) in a.iter_mut().enumerate() {
+            *x = i as _;
+        }
+
+        for i in 0..K {
+            b[i + i * NR] = 1.;
+        }
+        let mut c = [0.; MR * NR];
+        unsafe {
+            kernel_fn(K, 1., &a[0], &b[0], 0., &mut c[0], 1, MR as isize);
+            // col major C
+        }
+        assert_eq!(&a[..], &c[..a.len()]);
     }
-    for i in 0..4 {
-        b[i + i * 8] = 1.;
+
+    #[test]
+    fn test_native_kernel() {
+        test_a_kernel("kernel", kernel);
     }
-    let mut c = [0.; 32];
-    unsafe {
-        kernel(4, 1., &a[0], &b[0], 0., &mut c[0], 1, 4);
-        // col major C
+
+    #[test]
+    fn test_kernel_fallback_impl() {
+        test_a_kernel("kernel", kernel_fallback_impl);
     }
-    assert_eq!(&a, &c[..16]);
-}
 
+    #[test]
+    fn test_loop_m_n() {
+        let mut m = [[0; NR]; MR];
+        loop_m!(i, loop_n!(j, m[i][j] += 1));
+        for arr in &m[..] {
+            for elt in &arr[..] {
+                assert_eq!(*elt, 1);
+            }
+        }
+    }
+
+    mod test_arch_kernels {
+        use super::test_a_kernel;
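+        // generates one #[test] per (feature, kernel) pair; each test
+        // skips itself at runtime when the host CPU lacks the feature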
+        macro_rules! test_arch_kernels_x86 {
+            ($($feature_name:tt, $function_name:ident),*) => {
+                $(
+                #[test]
+                fn $function_name() {
+                    if is_x86_feature_detected_!($feature_name) {
+                        test_a_kernel(stringify!($function_name), super::super::$function_name);
+                    } else {
+                        println!("Skipping, host does not have feature: {:?}", $feature_name);
+                    }
+                }
+                )*
+            }
+        }
+
+        #[cfg(any(target_arch="x86", target_arch="x86_64"))]
+        test_arch_kernels_x86! {
+            "avx", kernel_target_avx,
+            "sse2", kernel_target_sse2
+        }
+    }
+}
diff --git a/src/util.rs b/src/util.rs
index 5ddc84c..a3524b2 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
diff --git a/tests/sgemm.rs b/tests/sgemm.rs
index c297b5a..7137354 100644
--- a/tests/sgemm.rs
+++ b/tests/sgemm.rs
@@ -6,7 +6,7 @@
 use std::fmt::{Display, Debug};
 
 trait Float : Copy + Display + Debug + PartialEq {
     fn zero() -> Self;
     fn one() -> Self;
-    fn from(i64) -> Self;
+    fn from(x: i64) -> Self;
     fn nan() -> Self;
 }