diff --git a/.travis.yml b/.travis.yml
index 2b60f12..db4ed82 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,21 +1,49 @@
 language: rust
 sudo: false
-# run builds for all the trains (and more)
-rust:
-  - 1.12.0
-  - stable
-  - beta
-  - nightly
+matrix:
+  include:
+    - rust: 1.28.0
+      env:
+        TARGET=x86_64-unknown-linux-gnu
+    - rust: stable
+      env:
+        TARGET=x86_64-unknown-linux-gnu
+    - rust: stable
+      env:
+        TARGET=i686-unknown-linux-gnu
+    - rust: beta
+      env:
+        TARGET=x86_64-unknown-linux-gnu
+    - rust: nightly
+      env:
+        TARGET=x86_64-unknown-linux-gnu
+    - rust: nightly
+      env:
+        TARGET=aarch64-unknown-linux-gnu
+        BUILD_ONLY=1
+env:
+  global:
+    - HOST=x86_64-unknown-linux-gnu
+
+addons:
+  apt:
+    packages:
+      # needed for i686-unknown-linux-gnu target
+      - gcc-multilib
+install:
+  # "rustup error: cannot re-add" without this conditional check
+  - if [[ $HOST != $TARGET ]]; then rustup target add $TARGET; fi
 
 # the main build script:
 script:
   - |
-      cargo build &&
-      cargo test &&
-      cargo test --release &&
-      cargo doc &&
-      cargo bench
+      cargo build --target=$TARGET &&
+      ([ -n "$BUILD_ONLY" ] || (
+      cargo test --target=$TARGET &&
+      cargo test --release --target=$TARGET &&
+      cargo doc --target=$TARGET &&
+      cargo bench --target=$TARGET ))
 
 branches:
   only:
diff --git a/Cargo.toml b/Cargo.toml
index 28e8a65..bf95480 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,8 +12,6 @@ description = "General matrix multiplication of f32 and f64 matrices in Rust. Su
 keywords = ["matrix", "sgemm", "dgemm"]
 
-build = "build.rs"
-
 [lib]
 bench = false
diff --git a/LICENSE-MIT b/LICENSE-MIT
index 9203baa..e2d8219 100644
--- a/LICENSE-MIT
+++ b/LICENSE-MIT
@@ -1,4 +1,4 @@
-Copyright (c) 2015
+Copyright (c) 2016 - 2018 Ulrik Sverdrup "bluss"
 
 Permission is hereby granted, free of charge, to any
 person obtaining a copy of this software and associated
diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
index 2df0187..a3bd3fd 100644
--- a/benches/benchmarks.rs
+++ b/benches/benchmarks.rs
@@ -40,43 +40,44 @@ macro_rules! mat_mul {
     };
 }
 
-benchmark_main!(mat_mul_f32, mat_mul_f64, ref_mat_mul_f32);
+benchmark_main!(mat_mul_f32, mat_mul_f64);
 
 mat_mul!{mat_mul_f32, sgemm,
     (m004, 4, 4, 4)
-    (m005, 5, 5, 5)
     (m006, 6, 6, 6)
-    (m007, 7, 7, 7)
     (m008, 8, 8, 8)
-    (m009, 9, 9, 9)
     (m012, 12, 12, 12)
     (m016, 16, 16, 16)
     (m032, 32, 32, 32)
     (m064, 64, 64, 64)
     (m127, 127, 127, 127)
+    /*
     (m256, 256, 256, 256)
     (m512, 512, 512, 512)
     (mix16x4, 32, 4, 32)
     (mix32x2, 32, 2, 32)
     (mix97, 97, 97, 125)
     (mix128x10000x128, 128, 10000, 128)
+    */
 }
 
 mat_mul!{mat_mul_f64, dgemm,
     (m004, 4, 4, 4)
-    (m007, 7, 7, 7)
+    (m006, 6, 6, 6)
     (m008, 8, 8, 8)
     (m012, 12, 12, 12)
     (m016, 16, 16, 16)
     (m032, 32, 32, 32)
     (m064, 64, 64, 64)
     (m127, 127, 127, 127)
+    /*
     (m256, 256, 256, 256)
     (m512, 512, 512, 512)
     (mix16x4, 32, 4, 32)
     (mix32x2, 32, 2, 32)
     (mix97, 97, 97, 125)
     (mix128x10000x128, 128, 10000, 128)
+    */
 }
 
 use std::ops::{Add, Mul};
diff --git a/blas-bench/Cargo.toml b/blas-bench/Cargo.toml
new file mode 100644
index 0000000..d6ce102
--- /dev/null
+++ b/blas-bench/Cargo.toml
@@ -0,0 +1,31 @@
+[package]
+name = "blas-bench"
+version = "0.1.0"
+authors = ["bluss"]
+publish = false
+
+license = "MIT/Apache-2.0"
+
+repository = "https://github.com/bluss/matrixmultiply/"
+documentation = ""
+
+description = "Blas benchmarks for comparison with matrixmultiply"
+
+keywords = ["matrix", "sgemm", "dgemm"]
+
+[lib]
+bench = false
+
+[[bench]]
+name = "benchmarks"
+harness = false
+
+[dependencies]
+rawpointer = "0.1"
+matrixmultiply = { path = ".." }
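+# Note: no blas-src backend feature is enabled below; the directive that
+# links the system openblas comes from this crate's build.rs instead.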
+blas = { version = "0.20", default-features = false }
+blas-src = { version = "0.2.0", default-features = false }
+
+
+[dev-dependencies]
+bencher = "0.1.2"
diff --git a/blas-bench/README.md b/blas-bench/README.md
new file mode 100644
index 0000000..806583e
--- /dev/null
+++ b/blas-bench/README.md
@@ -0,0 +1,7 @@
+
+Run BLAS benchmarks to compare with matrixmultiply.
+
+These benchmarks are set up to link against a system-installed openblas
+(see the build.rs file), because building all of openblas just to benchmark
+against it is tedious. Make sure openblas (or another library that provides
+the cblas interface) is installed, and tweak the build.rs file to suit.
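+
+For example, on a Debian-style system (package names may vary):
+
+    apt install libopenblas-dev
+    cargo bench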
diff --git a/blas-bench/benches/benchmarks.rs b/blas-bench/benches/benchmarks.rs
new file mode 100644
index 0000000..91fda89
--- /dev/null
+++ b/blas-bench/benches/benchmarks.rs
@@ -0,0 +1,81 @@
+extern crate blas_bench;
+extern crate matrixmultiply;
+pub use matrixmultiply::sgemm;
+pub use matrixmultiply::dgemm;
+
+#[macro_use]
+extern crate bencher;
+extern crate blas;
+
+use std::os::raw::c_int;
+
+
+#[allow(non_camel_case_types)]
+type blas_index = c_int; // blas index type
+
+
+// Compute GFlop/s
+// as flop / s = 2 * M * N * K / time
+
+
+benchmark_main!(blas_mat_mul_f32, blas_mat_mul_f64);
+
+macro_rules! blas_mat_mul {
+    ($modname:ident, $gemm:ident, $(($name:ident, $m:expr, $n:expr, $k:expr))+) => {
+        mod $modname {
+            use bencher::{Bencher};
+            use super::blas_index;
+            $(
+            pub fn $name(bench: &mut Bencher)
+            {
+                // column major: a is m x k, b is k x n, c is m x n
+                let a = vec![0.; $m * $k];
+                let b = vec![0.; $k * $n];
+                let mut c = vec![0.; $m * $n];
+                bench.iter(|| {
+                    unsafe {
+                        blas::$gemm(
+                            b'N',
+                            b'N',
+                            $m as blas_index, // m, rows of Op(a)
+                            $n as blas_index, // n, cols of Op(b)
+                            $k as blas_index, // k, cols of Op(a)
+                            1.,
+                            &a,
+                            $m, // lda
+                            &b,
+                            $k, // ldb
+                            0., // beta
+                            &mut c,
+                            $m, // ldc
+                        );
+                    }
+                });
+            }
+            )+
+        }
+        benchmark_group!{ $modname, $($modname::$name),+ }
+    };
+}
+
+blas_mat_mul!{blas_mat_mul_f32, sgemm,
+    (m004, 4, 4, 4)
+    (m006, 6, 6, 6)
+    (m008, 8, 8, 8)
+    (m012, 12, 12, 12)
+    (m016, 16, 16, 16)
+    (m032, 32, 32, 32)
+    (m064, 64, 64, 64)
+    (m127, 127, 127, 127)
+}
+
+blas_mat_mul!{blas_mat_mul_f64, dgemm,
+    (m004, 4, 4, 4)
+    (m006, 6, 6, 6)
+    (m008, 8, 8, 8)
+    (m012, 12, 12, 12)
+    (m016, 16, 16, 16)
+    (m032, 32, 32, 32)
+    (m064, 64, 64, 64)
+    (m127, 127, 127, 127)
+}
diff --git a/blas-bench/build.rs b/blas-bench/build.rs
new file mode 100644
index 0000000..9bd617a
--- /dev/null
+++ b/blas-bench/build.rs
@@ -0,0 +1,12 @@
+
+///
+/// This build script emits the directive to link against openblas
+///
+
+fn main() {
+    // Always link openblas. Compiling blas just for testing is tedious --
+    // install it on your system and run this.
+    println!("cargo:rerun-if-changed=build.rs");
+    println!("cargo:rustc-link-lib={}=openblas", "dylib");
+}
diff --git a/blas-bench/src/lib.rs b/blas-bench/src/lib.rs
new file mode 100644
index 0000000..31e1bb2
--- /dev/null
+++ b/blas-bench/src/lib.rs
@@ -0,0 +1,7 @@
+#[cfg(test)]
+mod tests {
+    #[test]
+    fn it_works() {
+        assert_eq!(2 + 2, 4);
+    }
+}
diff --git a/build.rs b/build.rs
deleted file mode 100644
index 32f833c..0000000
--- a/build.rs
+++ /dev/null
@@ -1,14 +0,0 @@
-
-//!
-
-use std::env;
-
-fn main() {
-    println!("cargo:rerun-if-changed=build.rs");
-
-    if let Ok(features) = env::var("CARGO_CFG_TARGET_FEATURE") {
-        if features.split(",").map(|s| s.trim()).any(|feat| feat == "avx") {
-            println!("cargo:rustc-cfg=sgemm_8x8");
-        }
-    }
-}
diff --git a/spare kernels/x86_sse_sgemm.rs b/spare kernels/x86_sse_sgemm.rs
new file mode 100644
index 0000000..720c93c
--- /dev/null
+++ b/spare kernels/x86_sse_sgemm.rs
@@ -0,0 +1,84 @@
+
+// 4x4 sse sgemm
+macro_rules! mm_transpose4 {
+    ($c0:expr, $c1:expr, $c2:expr, $c3:expr) => {{
+        // This is _MM_TRANSPOSE4_PS except we take variables, not references
+        let tmp0 = _mm_unpacklo_ps($c0, $c1);
+        let tmp2 = _mm_unpacklo_ps($c2, $c3);
+        let tmp1 = _mm_unpackhi_ps($c0, $c1);
+        let tmp3 = _mm_unpackhi_ps($c2, $c3);
+
+        $c0 = _mm_movelh_ps(tmp0, tmp2);
+        $c1 = _mm_movehl_ps(tmp2, tmp0);
+        $c2 = _mm_movelh_ps(tmp1, tmp3);
+        $c3 = _mm_movehl_ps(tmp3, tmp1);
+    }}
+}
+
+#[inline(always)]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+unsafe fn kernel_x86_sse(k: usize, alpha: T, a: *const T, b: *const T,
+                         beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    let mut ab = [_mm_setzero_ps(); MR];
+
+    let mut bv;
+    let (mut a, mut b) = (a, b);
+
+    // Compute A B
+    for _ in 0..k {
+        bv = _mm_load_ps(b as _); // aligned due to GemmKernel::align_to
+
+        loop_m!(i, {
+            // Compute ab_i += [ai b_j+0, ai b_j+1, ai b_j+2, ai b_j+3]
+            let aiv = _mm_set1_ps(at(a, i));
+            ab[i] = _mm_add_ps(ab[i], _mm_mul_ps(aiv, bv));
+        });
+
+        a = a.add(MR);
+        b = b.add(NR);
+    }
+
+    // Compute α (A B)
+    let alphav = _mm_set1_ps(alpha);
+    loop_m!(i, ab[i] = _mm_mul_ps(alphav, ab[i]));
+
+    macro_rules! c {
+        ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize));
+    }
+
+    // C ← α A B + β C
+    let mut c = [_mm_setzero_ps(); MR];
+    let betav = _mm_set1_ps(beta);
+    if beta != 0. {
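+        // beta == 0. means C may be uninitialized on entry and must not be
+        // read; in that case we skip the load and just overwrite below.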
+        // Read C
+        if csc == 1 {
+            loop_m!(i, c[i] = _mm_loadu_ps(c![i, 0]));
+        } else if rsc == 1 {
+            loop_m!(i, c[i] = _mm_loadu_ps(c![0, i]));
+            mm_transpose4!(c[0], c[1], c[2], c[3]);
+        } else {
+            loop_m!(i, c[i] = _mm_set_ps(*c![i, 3], *c![i, 2], *c![i, 1], *c![i, 0]));
+        }
+        // Compute β C
+        loop_m!(i, c[i] = _mm_mul_ps(c[i], betav));
+    }
+
+    // Compute (α A B) + (β C)
+    loop_m!(i, c[i] = _mm_add_ps(c[i], ab[i]));
+
+    // Store C back to memory
+    if csc == 1 {
+        loop_m!(i, _mm_storeu_ps(c![i, 0], c[i]));
+    } else if rsc == 1 {
+        mm_transpose4!(c[0], c[1], c[2], c[3]);
+        loop_m!(i, _mm_storeu_ps(c![0, i], c[i]));
+    } else {
+        // extract the nth value of a vector using _mm_cvtss_f32 (extract lowest)
+        // in combination with shuffle (move nth value to first position)
+        loop_m!(i, *c![i, 0] = _mm_cvtss_f32(c[i]));
+        loop_m!(i, *c![i, 1] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 1)));
+        loop_m!(i, *c![i, 2] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 2)));
+        loop_m!(i, *c![i, 3] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 3)));
+    }
+}
diff --git a/src/aligned_alloc.rs b/src/aligned_alloc.rs
new file mode 100644
index 0000000..f0f7392
--- /dev/null
+++ b/src/aligned_alloc.rs
@@ -0,0 +1,73 @@
+
+use std::alloc;
+use std::alloc::{Layout, handle_alloc_error};
+use std::{mem, cmp};
+
+#[cfg(test)]
+use std::ops::{Deref, DerefMut};
+#[cfg(test)]
+use std::slice;
+
+pub(crate) struct Alloc<T> { ptr: *mut T, len: usize, align: usize }
+
+impl<T> Alloc<T> {
+    #[inline]
+    pub unsafe fn new(nelem: usize, align: usize) -> Self {
+        let align = cmp::max(align, mem::align_of::<T>());
+        #[cfg(debug_assertions)]
+        let layout = Layout::from_size_align(mem::size_of::<T>() * nelem, align).unwrap();
+        #[cfg(not(debug_assertions))]
+        let layout = Layout::from_size_align_unchecked(mem::size_of::<T>() * nelem, align);
+        dprint!("Allocating nelem={}, layout={:?}", nelem, layout);
+        let ptr = alloc::alloc(layout);
+        if ptr.is_null() {
+            handle_alloc_error(layout);
+        }
+        Alloc {
+            ptr: ptr as *mut T,
+            len: nelem,
+            align,
+        }
+    }
+
+    #[cfg(test)]
+    pub fn init_with(mut self, elt: T) -> Alloc<T> where T: Copy
+    {
+        for elt1 in &mut self[..] {
+            *elt1 = elt;
+        }
+        self
+    }
+
+    #[inline]
+    pub fn ptr_mut(&mut self) -> *mut T { self.ptr }
+}
+
+impl<T> Drop for Alloc<T> {
+    fn drop(&mut self) {
+        unsafe {
+            let layout = Layout::from_size_align_unchecked(mem::size_of::<T>() * self.len, self.align);
+            alloc::dealloc(self.ptr as _, layout);
+        }
+    }
+}
+
+#[cfg(test)]
+impl<T> Deref for Alloc<T> {
+    type Target = [T];
+    fn deref(&self) -> &[T] {
+        unsafe {
+            slice::from_raw_parts(self.ptr, self.len)
+        }
+    }
+}
+
+#[cfg(test)]
+impl<T> DerefMut for Alloc<T> {
+    fn deref_mut(&mut self) -> &mut [T] {
+        unsafe {
+            slice::from_raw_parts_mut(self.ptr, self.len)
+        }
+    }
+}
diff --git a/src/archmacros_x86.rs b/src/archmacros_x86.rs
new file mode 100644
index 0000000..5ef1e23
--- /dev/null
+++ b/src/archmacros_x86.rs
@@ -0,0 +1,16 @@
+#![cfg(any(target_arch="x86", target_arch="x86_64"))]
+
+macro_rules! compile_env_enabled {
+    ($($name:tt)*) => {
+        !option_env!($($name)*).unwrap_or("").is_empty()
+    }
+}
+
+macro_rules! is_x86_feature_detected_ {
+    ($name:tt) => {
+        // for testing purposes, we can disable a feature at compile time by
+        // setting MMNO_avx=1 etc.
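+        // e.g. building with `MMNO_avx=1 cargo bench` makes dispatch skip
+        // the avx kernel and fall through to sse2 / the fallback.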
+        !compile_env_enabled!(concat!("MMNO_", $name)) && is_x86_feature_detected!($name)
+    }
+}
diff --git a/src/archparam.rs b/src/archparam.rs
index ed31432..d1cf9e8 100644
--- a/src/archparam.rs
+++ b/src/archparam.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
diff --git a/src/debugmacros.rs b/src/debugmacros.rs
index 5494fb1..2ce62ff 100644
--- a/src/debugmacros.rs
+++ b/src/debugmacros.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
diff --git a/src/dgemm_kernel.rs b/src/dgemm_kernel.rs
index de79688..e4ab4d8 100644
--- a/src/dgemm_kernel.rs
+++ b/src/dgemm_kernel.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -72,12 +72,44 @@ impl GemmKernel for Gemm {
 pub unsafe fn kernel(k: usize, alpha: T, a: *const T, b: *const T,
                      beta: T, c: *mut T, rsc: isize, csc: isize)
 {
-    // using `uninitialized` is a workaround for issue https://github.com/bluss/matrixmultiply/issues/9
-    let mut ab: [[T; NR]; MR] = ::std::mem::uninitialized();
+    // dispatch to specific compiled versions
+    #[cfg(any(target_arch="x86", target_arch="x86_64"))]
+    {
+        if is_x86_feature_detected_!("avx") {
+            return kernel_target_avx(k, alpha, a, b, beta, c, rsc, csc);
+        } else if is_x86_feature_detected_!("sse2") {
+            return kernel_target_sse2(k, alpha, a, b, beta, c, rsc, csc);
+        }
+    }
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc);
+}
+
+#[inline]
+#[target_feature(enable="avx")]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+pub unsafe fn kernel_target_avx(k: usize, alpha: T, a: *const T, b: *const T,
+                                beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc)
+}
+
+#[inline]
+#[target_feature(enable="sse2")]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+pub unsafe fn kernel_target_sse2(k: usize, alpha: T, a: *const T, b: *const T,
+                                 beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc)
+}
+
+#[inline(always)]
+pub unsafe fn kernel_fallback_impl(k: usize, alpha: T, a: *const T, b: *const T,
+                                   beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    let mut ab: [[T; NR]; MR] = [[0.; NR]; MR];
     let mut a = a;
     let mut b = b;
     debug_assert_eq!(beta, 0.); // always masked
-    loop_m!(i, loop_n!(j, ab[i][j] = 0.));
 
     // Compute matrix multiplication into ab[i][j]
     unroll_by!(4 => k, {
@@ -100,22 +132,82 @@
 unsafe fn at(ptr: *const T, i: usize) -> T {
     *ptr.offset(i as isize)
 }
 
-#[test]
-fn test_gemm_kernel() {
-    let mut a = [1.; 32];
-    let mut b = [0.; 16];
-    for (i, x) in a.iter_mut().enumerate() {
-        *x = i as f64;
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use aligned_alloc::Alloc;
+
+    fn aligned_alloc<T>(elt: T, n: usize) -> Alloc<T> where T: Copy
+    {
+        unsafe {
+            Alloc::new(n, Gemm::align_to()).init_with(elt)
+        }
     }
-    for i in 0..4 {
-        b[i + i * 4] = 1.;
+
+    use super::T;
+    type KernelFn = unsafe fn(usize, T, *const T, *const T, T, *mut T, isize, isize);
+
+    fn test_a_kernel(_name: &str, kernel_fn: KernelFn) {
+        const K: usize = 4;
+        let mut a = aligned_alloc(1., MR * K);
+        let mut b = aligned_alloc(0., NR * K);
+        for (i, x) in a.iter_mut().enumerate() {
+            *x = i as _;
+        }
+
+        for i in 0..K {
+            b[i + i * NR] = 1.;
+        }
+        let mut c = [0.; MR * NR];
+        unsafe {
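+            // b is a K x NR identity block, so C = A B reproduces A,
+            // read back in column major order (rsc = 1, csc = MR)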
+            kernel_fn(K, 1., &a[0], &b[0], 0., &mut c[0], 1, MR as isize);
+            // col major C
+        }
+        assert_eq!(&a[..], &c[..a.len()]);
     }
-    let mut c = [0.; 32];
-    unsafe {
-        kernel(4, 1., &a[0], &b[0],
-               0., &mut c[0], 1, 8);
-        // transposed C so that results line up
+
+    #[test]
+    fn test_native_kernel() {
+        test_a_kernel("kernel", kernel);
+    }
+
+    #[test]
+    fn test_kernel_fallback_impl() {
+        test_a_kernel("kernel", kernel_fallback_impl);
+    }
+
+    #[test]
+    fn test_loop_m_n() {
+        let mut m = [[0; NR]; MR];
+        loop_m!(i, loop_n!(j, m[i][j] += 1));
+        for arr in &m[..] {
+            for elt in &arr[..] {
+                assert_eq!(*elt, 1);
+            }
+        }
     }
-    assert_eq!(&a, &c);
-}
+
+    mod test_arch_kernels {
+        use super::test_a_kernel;
+        macro_rules! test_arch_kernels_x86 {
+            ($($feature_name:tt, $function_name:ident),*) => {
+                $(
+                #[test]
+                fn $function_name() {
+                    if is_x86_feature_detected_!($feature_name) {
+                        test_a_kernel(stringify!($function_name), super::super::$function_name);
+                    } else {
+                        println!("Skipping, host does not have feature: {:?}", $feature_name);
+                    }
+                }
+                )*
+            }
+        }
+
+        #[cfg(any(target_arch="x86", target_arch="x86_64"))]
+        test_arch_kernels_x86! {
+            "avx", kernel_target_avx,
+            "sse2", kernel_target_sse2
+        }
+    }
+}
diff --git a/src/gemm.rs b/src/gemm.rs
index 64dfb1f..b3e7cbf 100644
--- a/src/gemm.rs
+++ b/src/gemm.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -9,6 +9,8 @@
 use std::cmp::min;
 use std::mem::size_of;
 
+use aligned_alloc::Alloc;
+
 use util::range_chunk;
 use util::round_up_to;
@@ -136,9 +138,9 @@ unsafe fn gemm_loop<K>(
     let kmc = K::mc();
     ensure_kernel_params::<K>();
 
-    let (mut packv, bp_offset) = packing_vec::<K>(m, k, n);
-    let app = make_aligned_vec_ptr(K::align_to(), &mut packv);
-    let bpp = app.offset(bp_offset);
+    let (mut packing_buffer, bp_offset) = make_packing_buffer::<K>(m, k, n);
+    let app = packing_buffer.ptr_mut();
+    let bpp = app.add(bp_offset);
 
     // LOOP 5: split n into nc parts
     for (l5, nc) in range_chunk(n, knc) {
@@ -151,7 +153,6 @@
             dprint!("LOOP 4, {}, kc={}", l4, kc);
             let b = b.stride_offset(rsb, kkc * l4);
             let a = a.stride_offset(csa, kkc * l4);
-            debug!(for elt in &mut packv { *elt = <_>::one(); });
 
             // Pack B -> B~
             pack(kc, nc, K::nr(), bpp, b, csb, rsb);
@@ -232,10 +233,12 @@ unsafe fn gemm_packed(nc: usize, kc: usize, mc: usize,
 /// but we can make them smaller if the matrix is smaller than this (just ensure
 /// we have rounded up to a multiple of the kernel size).
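+/// (Example: for f32 with the 8x8 kernel, m = k = n = 64 gives
+/// apack_size = bpack_size = 64 * 64, i.e. bp_offset = 4096 elements.)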
 ///
-/// Return packing vector and offset to start of b
+/// Return packing buffer and offset to start of b
-unsafe fn packing_vec<K>(m: usize, k: usize, n: usize) -> (Vec<K::Elem>, isize)
+unsafe fn make_packing_buffer<K>(m: usize, k: usize, n: usize) -> (Alloc<K::Elem>, usize)
     where K: GemmKernel,
 {
+    // max alignment requirement is a multiple of min(MR, NR) * sizeof
+    // because apack_size is a multiple of MR, start of b aligns fine
     let m = min(m, K::mc());
     let k = min(k, K::kc());
     let n = min(n, K::nc());
@@ -244,29 +247,13 @@
     let apack_size = k * round_up_to(m, K::mr());
     let bpack_size = k * round_up_to(n, K::nr());
     let nelem = apack_size + bpack_size;
-    let mut v = Vec::with_capacity(nelem);
-    v.set_len(nelem);
+    dprint!("packed nelem={}, apack={}, bpack={}, m={} k={} n={}",
+            nelem, apack_size, bpack_size, m, k, n);
-    // max alignment requirement is a multiple of min(MR, NR) * sizeof
-    // because apack_size is a multiple of MR, start of b aligns fine
-    (v, apack_size as isize)
-}
 
-/// Align a pointer into the vec. Will reallocate to fit & shift the pointer
-/// forwards if needed. This invalidates any previous pointers into the v.
-unsafe fn make_aligned_vec_ptr<U>(align_to: usize, v: &mut Vec<U>) -> *mut U {
-    let mut ptr = v.as_mut_ptr();
-    if align_to != 0 {
-        if v.as_ptr() as usize % align_to != 0 {
-            let cap = v.capacity();
-            v.reserve_exact(cap + align_to / size_of::<U>() - 1);
-            ptr = align_ptr(align_to, v.as_mut_ptr());
-        }
-    }
-    ptr
+    (Alloc::new(nelem, K::align_to()), apack_size)
 }
 
 /// offset the ptr forwards to align to a specific byte count
diff --git a/src/kernel.rs b/src/kernel.rs
index 9a891a6..1d59dcd 100644
--- a/src/kernel.rs
+++ b/src/kernel.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
diff --git a/src/lib.rs b/src/lib.rs
index 71e5afb..b87cfc1 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -52,6 +52,7 @@
 
 extern crate rawpointer;
 
+#[macro_use] mod archmacros_x86;
 #[macro_use] mod debugmacros;
 #[macro_use] mod loopmacros;
 mod archparam;
@@ -60,6 +61,7 @@
 mod gemm;
 mod sgemm_kernel;
 mod dgemm_kernel;
 mod util;
+mod aligned_alloc;
 
 pub use gemm::sgemm;
 pub use gemm::dgemm;
diff --git a/src/loopmacros.rs b/src/loopmacros.rs
index c13e0b1..8e40d81 100644
--- a/src/loopmacros.rs
+++ b/src/loopmacros.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -6,16 +6,6 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-macro_rules! if_cfg {
-    ($id:ident, $type_:ty, $x:expr, $y:expr) => {{
-        #[cfg($id)]
-        const TMP: $type_ = $x;
-        #[cfg(not($id))]
-        const TMP: $type_ = $y;
-        TMP
-    }}
-}
-
 // Unroll only in non-debug builds
 
 #[cfg(not(debug_assertions))]
@@ -79,3 +69,34 @@ macro_rules! unroll_by {
         }
     }}
 }
+
+#[cfg(debug_assertions)]
+macro_rules! unroll_by_with_last {
+    ($by:tt => $ntimes:expr, $is_last:ident, $e:expr) => {{
+        let k = $ntimes - 1;
+        let $is_last = false;
+        for _ in 0..k {
+            $e;
+        }
+        let $is_last = true;
+        #[allow(unused_assignments)]
+        $e;
+    }}
+}
+
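+// Like unroll_by!, but binds $is_last = true for the final expansion of $e
+// so the body can skip work (such as loading data for a next iteration)
+// that only makes sense when another iteration follows.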
+#[cfg(not(debug_assertions))]
+macro_rules! unroll_by_with_last {
+    ($by:tt => $ntimes:expr, $is_last:ident, $e:expr) => {{
+        let k = $ntimes - 1;
+        let $is_last = false;
+        for _ in 0..k / $by {
+            repeat!($by $e);
+        }
+        for _ in 0..k % $by {
+            $e;
+        }
+        let $is_last = true;
+        #[allow(unused_assignments)]
+        $e;
+    }}
+}
diff --git a/src/sgemm_kernel.rs b/src/sgemm_kernel.rs
index e223632..be0eefb 100644
--- a/src/sgemm_kernel.rs
+++ b/src/sgemm_kernel.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
@@ -9,25 +9,27 @@
 use kernel::GemmKernel;
 use archparam;
 
+
+#[cfg(target_arch="x86")]
+use std::arch::x86::*;
+#[cfg(target_arch="x86_64")]
+use std::arch::x86_64::*;
+
 pub enum Gemm { }
 
 pub type T = f32;
 
-const MR: usize = if_cfg!(sgemm_8x8, usize, 8, 4);
+const MR: usize = 8;
 const NR: usize = 8;
 
-#[cfg(sgemm_8x8)]
 macro_rules! loop_m { ($i:ident, $e:expr) => { loop8!($i, $e) }; }
-#[cfg(not(sgemm_8x8))]
-macro_rules! loop_m { ($i:ident, $e:expr) => { loop4!($i, $e) }; }
-
 macro_rules! loop_n { ($j:ident, $e:expr) => { loop8!($j, $e) }; }
 
 impl GemmKernel for Gemm {
     type Elem = T;
 
     #[inline(always)]
-    fn align_to() -> usize { 0 }
+    fn align_to() -> usize { 32 }
 
     #[inline(always)]
     fn mr() -> usize { MR }
@@ -35,7 +37,7 @@ impl GemmKernel for Gemm {
     fn nr() -> usize { NR }
 
     #[inline(always)]
-    fn always_masked() -> bool { true }
+    fn always_masked() -> bool { false }
 
     #[inline(always)]
     fn nc() -> usize { archparam::S_NC }
@@ -68,19 +70,296 @@
 /// + rsc: row stride of c
 /// + csc: col stride of c
 /// + if beta is 0, then c does not need to be initialized
-#[inline(always)]
+#[inline(never)]
 pub unsafe fn kernel(k: usize, alpha: T, a: *const T, b: *const T,
                      beta: T, c: *mut T, rsc: isize, csc: isize)
 {
-    // using `uninitialized` is a workaround for issue https://github.com/bluss/matrixmultiply/issues/9
-    let mut ab: [[T; NR]; MR] = ::std::mem::uninitialized();
+    // dispatch to specific compiled versions
+    #[cfg(any(target_arch="x86", target_arch="x86_64"))]
+    {
+        if is_x86_feature_detected_!("avx") {
+            return kernel_target_avx(k, alpha, a, b, beta, c, rsc, csc);
+        } else if is_x86_feature_detected_!("sse2") {
+            return kernel_target_sse2(k, alpha, a, b, beta, c, rsc, csc);
+        }
+    }
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc);
+}
+
+#[inline]
+#[target_feature(enable="avx")]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+unsafe fn kernel_target_avx(k: usize, alpha: T, a: *const T, b: *const T,
+                            beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    kernel_x86_avx(k, alpha, a, b, beta, c, rsc, csc)
+}
+
+#[inline]
+#[target_feature(enable="sse2")]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+unsafe fn kernel_target_sse2(k: usize, alpha: T, a: *const T, b: *const T,
+                             beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    kernel_fallback_impl(k, alpha, a, b, beta, c, rsc, csc)
+}
+
+#[inline(always)]
+#[cfg(any(target_arch="x86", target_arch="x86_64"))]
+unsafe fn kernel_x86_avx(k: usize, alpha: T, a: *const T, b: *const T,
+                         beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    debug_assert_ne!(k, 0);
+
+    let mut ab = [_mm256_setzero_ps(); MR];
+
+    // this kernel can operate in either transposition (C = A B or C^T = B^T A^T)
+    const PREFER_ROW_MAJOR_C: bool = true;
+
+    let (mut a, mut b) = if PREFER_ROW_MAJOR_C { (a, b) } else { (b, a) };
+    let (rsc, csc) = if PREFER_ROW_MAJOR_C { (rsc, csc) } else { (csc, rsc) };
+
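+    // Swapping (a, b) together with (rsc, csc) computes the transposed
+    // product C^T = B^T A^T, so the code below only has to handle its
+    // preferred output layout.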
+    macro_rules! shuffle_mask {
+        ($z:expr, $y:expr, $x:expr, $w:expr) => {
+            ($z << 6) | ($y << 4) | ($x << 2) | $w
+        }
+    }
+    macro_rules! permute_mask {
+        ($z:expr, $y:expr, $x:expr, $w:expr) => {
+            ($z << 6) | ($y << 4) | ($x << 2) | $w
+        }
+    }
+
+    macro_rules! permute2f128_mask {
+        ($y:expr, $x:expr) => {
+            (($y << 4) | $x)
+        }
+    }
+
+    // Start data load before each iteration
+    let mut av = _mm256_load_ps(a);
+    let mut bv = _mm256_load_ps(b);
+
+    // Compute A B
+    unroll_by_with_last!(4 => k, is_last, {
+        // We compute abij = ai bj
+        //
+        // Load b as one contiguous vector
+        // Load a as striped vectors
+        //
+        // Shuffle the abij elements in order after the loop.
+        //
+        // Note this scheme copied and transposed from the BLIS 8x8 sgemm
+        // microkernel.
+        //
+        // Our a indices are striped and our b indices are linear. In
+        // the variable names below, we always have doubled indices so
+        // for example a0246 corresponds to a vector of a0 a0 a2 a2 a4 a4 a6 a6.
+        //
+        // ab0246: ab2064: ab4602: ab6420:
+        // ( ab00  ( ab20  ( ab40  ( ab60
+        //   ab01    ab21    ab41    ab61
+        //   ab22    ab02    ab62    ab42
+        //   ab23    ab03    ab63    ab43
+        //   ab44    ab64    ab04    ab24
+        //   ab45    ab65    ab05    ab25
+        //   ab66    ab46    ab26    ab06
+        //   ab67 )  ab47 )  ab27 )  ab07 )
+        //
+        // ab1357: ab3175: ab5713: ab7531:
+        // ( ab10  ( ab30  ( ab50  ( ab70
+        //   ab11    ab31    ab51    ab71
+        //   ab32    ab12    ab72    ab52
+        //   ab33    ab13    ab73    ab53
+        //   ab54    ab74    ab14    ab34
+        //   ab55    ab75    ab15    ab35
+        //   ab76    ab56    ab36    ab16
+        //   ab77 )  ab57 )  ab37 )  ab17 )
+
+        const PERM32_2301: i32 = permute_mask!(1, 0, 3, 2);
+        const PERM128_30: i32 = permute2f128_mask!(0, 3);
+
+        // _mm256_moveldup_ps(av):
+        // vmovsldup ymm2, ymmword ptr [rax]
+        //
+        // Load and duplicate each even word:
+        // ymm2 ← [a0 a0 a2 a2 a4 a4 a6 a6]
+        //
+        // _mm256_movehdup_ps(av):
+        // vmovshdup ymm2, ymmword ptr [rax]
+        //
+        // Load and duplicate each odd word:
+        // ymm2 ← [a1 a1 a3 a3 a5 a5 a7 a7]
+        //
+
+        let a0246 = _mm256_moveldup_ps(av); // Load: a0 a0 a2 a2 a4 a4 a6 a6
+        let a2064 = _mm256_permute_ps(a0246, PERM32_2301);
+
+        let a1357 = _mm256_movehdup_ps(av); // Load: a1 a1 a3 a3 a5 a5 a7 a7
+        let a3175 = _mm256_permute_ps(a1357, PERM32_2301);
+
+        let a4602 = _mm256_permute2f128_ps(a0246, a0246, PERM128_30);
+        let a6420 = _mm256_permute2f128_ps(a2064, a2064, PERM128_30);
+
+        let a5713 = _mm256_permute2f128_ps(a1357, a1357, PERM128_30);
+        let a7531 = _mm256_permute2f128_ps(a3175, a3175, PERM128_30);
+
+        ab[0] = _mm256_add_ps(ab[0], _mm256_mul_ps(a0246, bv));
+        ab[1] = _mm256_add_ps(ab[1], _mm256_mul_ps(a2064, bv));
+        ab[2] = _mm256_add_ps(ab[2], _mm256_mul_ps(a4602, bv));
+        ab[3] = _mm256_add_ps(ab[3], _mm256_mul_ps(a6420, bv));
+
+        ab[4] = _mm256_add_ps(ab[4], _mm256_mul_ps(a1357, bv));
+        ab[5] = _mm256_add_ps(ab[5], _mm256_mul_ps(a3175, bv));
+        ab[6] = _mm256_add_ps(ab[6], _mm256_mul_ps(a5713, bv));
+        ab[7] = _mm256_add_ps(ab[7], _mm256_mul_ps(a7531, bv));
+
+        if !is_last {
+            a = a.add(MR);
+            b = b.add(NR);
+
+            bv = _mm256_load_ps(b);
+            av = _mm256_load_ps(a);
+        }
+    });
+
+    let alphav = _mm256_set1_ps(alpha);
+
+    // Permute to put the abij elements in order
+    //
+    // shufps 0xe4: 22006644 00224466 -> 22226666
+    //
+    // vperm2 0x30: 00004444 44440000 -> 00000000
+    // vperm2 0x12: 00004444 44440000 -> 44444444
+    //
+
+    let ab0246 = ab[0];
+    let ab2064 = ab[1];
+    let ab4602 = ab[2];
+    let ab6420 = ab[3];
+
+    let ab1357 = ab[4];
+    let ab3175 = ab[5];
+    let ab5713 = ab[6];
+    let ab7531 = ab[7];
+
+    const SHUF_0123: i32 = shuffle_mask!(3, 2, 1, 0);
+    debug_assert_eq!(SHUF_0123, 0xE4);
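+    // shuffle_mask!(3, 2, 1, 0) packs each lane index into a 2-bit field:
+    // (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0b11_10_01_00 == 0xE4,
+    // i.e. every lane keeps its own position (the identity shuffle).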
+    const PERM128_03: i32 = permute2f128_mask!(3, 0);
+    const PERM128_21: i32 = permute2f128_mask!(1, 2);
+
+    // No elements are "shuffled" in truth, they all stay at their index
+    // but we combine vectors to de-stripe them.
+    //
+    // For example, the first shuffle below uses 0 1 2 3 which
+    // corresponds to the X0 X1 Y2 Y3 sequence etc:
+    //
+    //                                             variable
+    // X ab00 ab01 ab22 ab23 ab44 ab45 ab66 ab67   ab0246
+    // Y ab20 ab21 ab02 ab03 ab64 ab65 ab46 ab47   ab2064
+    //
+    //   X0 X1 Y2 Y3 X4 X5 Y6 Y7
+    // = ab00 ab01 ab02 ab03 ab44 ab45 ab46 ab47   ab0044
+
+    let ab0044 = _mm256_shuffle_ps(ab0246, ab2064, SHUF_0123);
+    let ab2266 = _mm256_shuffle_ps(ab2064, ab0246, SHUF_0123);
+
+    let ab4400 = _mm256_shuffle_ps(ab4602, ab6420, SHUF_0123);
+    let ab6622 = _mm256_shuffle_ps(ab6420, ab4602, SHUF_0123);
+
+    let ab1155 = _mm256_shuffle_ps(ab1357, ab3175, SHUF_0123);
+    let ab3377 = _mm256_shuffle_ps(ab3175, ab1357, SHUF_0123);
+
+    let ab5511 = _mm256_shuffle_ps(ab5713, ab7531, SHUF_0123);
+    let ab7733 = _mm256_shuffle_ps(ab7531, ab5713, SHUF_0123);
+
+    let ab0000 = _mm256_permute2f128_ps(ab0044, ab4400, PERM128_03);
+    let ab4444 = _mm256_permute2f128_ps(ab0044, ab4400, PERM128_21);
+
+    let ab2222 = _mm256_permute2f128_ps(ab2266, ab6622, PERM128_03);
+    let ab6666 = _mm256_permute2f128_ps(ab2266, ab6622, PERM128_21);
+
+    let ab1111 = _mm256_permute2f128_ps(ab1155, ab5511, PERM128_03);
+    let ab5555 = _mm256_permute2f128_ps(ab1155, ab5511, PERM128_21);
+
+    let ab3333 = _mm256_permute2f128_ps(ab3377, ab7733, PERM128_03);
+    let ab7777 = _mm256_permute2f128_ps(ab3377, ab7733, PERM128_21);
+
+    ab[0] = ab0000;
+    ab[1] = ab1111;
+    ab[2] = ab2222;
+    ab[3] = ab3333;
+    ab[4] = ab4444;
+    ab[5] = ab5555;
+    ab[6] = ab6666;
+    ab[7] = ab7777;
+
+    // Compute α (A B)
+    loop_m!(i, ab[i] = _mm256_mul_ps(alphav, ab[i]));
+
+    macro_rules! c {
+        ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize));
+    }
+
+    // C ← α A B + β C
+    let mut c = [_mm256_setzero_ps(); MR];
+    let betav = _mm256_set1_ps(beta);
+    if beta != 0. {
+        // Read C
+        if csc == 1 {
+            loop_m!(i, c[i] = _mm256_loadu_ps(c![i, 0]));
+            // Handle rsc == 1 case with transpose?
+        } else {
+            loop_m!(i, c[i] = _mm256_set_ps(*c![i, 7], *c![i, 6], *c![i, 5], *c![i, 4],
+                                            *c![i, 3], *c![i, 2], *c![i, 1], *c![i, 0]));
+        }
+        // Compute β C
+        loop_m!(i, c[i] = _mm256_mul_ps(c[i], betav));
+    }
+
+    // Compute (α A B) + (β C)
+    loop_m!(i, c[i] = _mm256_add_ps(c[i], ab[i]));
+
+    // Store C back to memory
+    if csc == 1 {
+        loop_m!(i, _mm256_storeu_ps(c![i, 0], c[i]));
+        // Handle rsc == 1 case with transpose?
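+        // (an 8x8 in-register transpose, analogous to mm_transpose4 in the
+        // sse kernel, could enable vector stores for column major C too)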
+    } else {
+        // Permute to bring each element in the vector to the front and store
+        loop_m!(i, {
+            let clo = _mm256_extractf128_ps(c[i], 0);
+            let chi = _mm256_extractf128_ps(c[i], 1);
+
+            _mm_store_ss(c![i, 0], clo);
+            let cperm = _mm_permute_ps(clo, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 1], cperm);
+            let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 2], cperm);
+            let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 3], cperm);
+
+            _mm_store_ss(c![i, 4], chi);
+            let cperm = _mm_permute_ps(chi, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 5], cperm);
+            let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 6], cperm);
+            let cperm = _mm_permute_ps(cperm, permute_mask!(0, 3, 2, 1));
+            _mm_store_ss(c![i, 7], cperm);
+        });
+    }
+}
+
+#[inline]
+unsafe fn kernel_fallback_impl(k: usize, alpha: T, a: *const T, b: *const T,
+                               beta: T, c: *mut T, rsc: isize, csc: isize)
+{
+    let mut ab: [[T; NR]; MR] = [[0.; NR]; MR];
     let mut a = a;
     let mut b = b;
-    debug_assert_eq!(beta, 0.); // always masked
-    loop_m!(i, loop_n!(j, ab[i][j] = 0.));
 
-    // Compute matrix multiplication into ab[i][j]
-    unroll_by!(5 => k, {
+    // Compute A B into ab[i][j]
+    unroll_by!(4 => k, {
         loop_m!(i, loop_n!(j, ab[i][j] += at(a, i) * at(b, j)));
 
         a = a.offset(MR as isize);
@@ -91,8 +370,12 @@
         ($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize));
     }
 
-    // set C = α A B
-    loop_n!(j, loop_m!(i, *c![i, j] = alpha * ab[i][j]));
+    // set C = α A B + β C
+    if beta == 0. {
+        loop_n!(j, loop_m!(i, *c![i, j] = alpha * ab[i][j]));
+    } else {
+        loop_n!(j, loop_m!(i, *c![i, j] = *c![i, j] * beta + alpha * ab[i][j]));
+    }
 }
 
 #[inline(always)]
@@ -100,22 +383,82 @@
 unsafe fn at(ptr: *const T, i: usize) -> T {
     *ptr.offset(i as isize)
 }
 
-#[test]
-fn test_gemm_kernel() {
-    let mut a = [1.; 16];
-    let mut b = [0.; 32];
-    for (i, x) in a.iter_mut().enumerate() {
-        *x = i as f32;
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use aligned_alloc::Alloc;
+
+    fn aligned_alloc<T>(elt: T, n: usize) -> Alloc<T> where T: Copy
+    {
+        unsafe {
+            Alloc::new(n, Gemm::align_to()).init_with(elt)
+        }
+    }
+
+    use super::T;
+    type KernelFn = unsafe fn(usize, T, *const T, *const T, T, *mut T, isize, isize);
+
+    fn test_a_kernel(_name: &str, kernel_fn: KernelFn) {
+        const K: usize = 4;
+        let mut a = aligned_alloc(1., MR * K);
+        let mut b = aligned_alloc(0., NR * K);
+        for (i, x) in a.iter_mut().enumerate() {
+            *x = i as _;
+        }
+
+        for i in 0..K {
+            b[i + i * NR] = 1.;
+        }
+        let mut c = [0.; MR * NR];
+        unsafe {
+            kernel_fn(K, 1., &a[0], &b[0], 0., &mut c[0], 1, MR as isize);
+            // col major C
+        }
+        assert_eq!(&a[..], &c[..a.len()]);
     }
-    for i in 0..4 {
-        b[i + i * 8] = 1.;
+
+    #[test]
+    fn test_native_kernel() {
+        test_a_kernel("kernel", kernel);
     }
-    let mut c = [0.; 32];
-    unsafe {
-        kernel(4, 1., &a[0], &b[0], 0., &mut c[0], 1, 4);
-        // col major C
+
+    #[test]
+    fn test_kernel_fallback_impl() {
+        test_a_kernel("kernel", kernel_fallback_impl);
     }
-    assert_eq!(&a, &c[..16]);
-}
 
+    #[test]
+    fn test_loop_m_n() {
+        let mut m = [[0; NR]; MR];
+        loop_m!(i, loop_n!(j, m[i][j] += 1));
+        for arr in &m[..] {
+            for elt in &arr[..] {
+                assert_eq!(*elt, 1);
+            }
+        }
+    }
+
+    mod test_arch_kernels {
+        use super::test_a_kernel;
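+        // generates one #[test] per (feature, kernel) pair; each test
+        // skips itself at runtime when the host CPU lacks the feature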
+        macro_rules! test_arch_kernels_x86 {
+            ($($feature_name:tt, $function_name:ident),*) => {
+                $(
+                #[test]
+                fn $function_name() {
+                    if is_x86_feature_detected_!($feature_name) {
+                        test_a_kernel(stringify!($function_name), super::super::$function_name);
+                    } else {
+                        println!("Skipping, host does not have feature: {:?}", $feature_name);
+                    }
+                }
+                )*
+            }
+        }
+
+        #[cfg(any(target_arch="x86", target_arch="x86_64"))]
+        test_arch_kernels_x86! {
+            "avx", kernel_target_avx,
+            "sse2", kernel_target_sse2
+        }
+    }
+}
diff --git a/src/util.rs b/src/util.rs
index 5ddc84c..a3524b2 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -1,4 +1,4 @@
-// Copyright 2016 bluss
+// Copyright 2016 - 2018 Ulrik Sverdrup "bluss"
 //
 // Licensed under the Apache License, Version 2.0 or the MIT license
diff --git a/tests/sgemm.rs b/tests/sgemm.rs
index c297b5a..7137354 100644
--- a/tests/sgemm.rs
+++ b/tests/sgemm.rs
@@ -6,7 +6,7 @@
 use std::fmt::{Display, Debug};
 
 trait Float : Copy + Display + Debug + PartialEq {
     fn zero() -> Self;
     fn one() -> Self;
-    fn from(i64) -> Self;
+    fn from(x: i64) -> Self;
     fn nan() -> Self;
 }