Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use std::arch SIMD and runtime target feature detection #22

Merged
merged 51 commits into from
Nov 15, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
b6b089a
FEAT: Full 4x4 sse gemm kernel (only supports masked)
bluss Nov 13, 2018
fefad57
FEAT: Complete unmasked 4x4 sgemm kernel
bluss Nov 13, 2018
1eb4ec9
FIX: sgemm: Use whole vector load/stores of C if strides allow
bluss Nov 13, 2018
6907d2f
Dispatch to specialized versions at runtime (avx vs sse on x86-* plat…
bluss Nov 13, 2018
51f0e23
FEAT: sgemm: add back generic kernel fallback and fix test
bluss Nov 13, 2018
c8ffe7b
FIX: Request 16-byte alignment for simd sgemm and use aligned load
bluss Nov 13, 2018
186d015
MAINT: Edit travis for multiarch builds
bluss Nov 13, 2018
4c4f3a8
MAINT: Add test for loop_m/loop_n correctness
bluss Nov 13, 2018
c50b77c
FIX: Remove uninitialized in sgemm fallback kernel
bluss Nov 13, 2018
c73c895
FEAT: Write sse kernel using arrays
bluss Nov 13, 2018
68b5b16
FEAT: add avx 8x8 sgemm kernel
bluss Nov 13, 2018
5fa96e8
FEAT: avx sgemm: use "striped" vectors ported from BLIS
bluss Nov 13, 2018
770da12
FEAT: sgemm: move shuffle and permutation masks to macros and constants
bluss Nov 13, 2018
80ed600
FEAT: for dgemm, enable avx code generation of the fallback impl
bluss Nov 13, 2018
8bd4d9e
FIX: Use inline on the sgemm fallback impl
bluss Nov 13, 2018
957ec0b
FEAT: Use alloc api for aligned alloc
bluss Nov 13, 2018
d5a477f
FIX: Remove redundant parantheses in macro
bluss Nov 13, 2018
bccd99f
FEAT: sgemm: Update store C to match what blis kernel does
bluss Nov 13, 2018
8da0c5e
FIX: sgemm make the main kernel function #[inline]
bluss Nov 13, 2018
2904174
FIX: sgemm un-pub internal functions
bluss Nov 13, 2018
3d9fded
TEST: Implement aligned alloc so we can test with 32-byte align
bluss Nov 13, 2018
858ed46
TEST: Add a way to disable a specific target feature's detection
bluss Nov 13, 2018
e049912
FEAT: Add aligned allocation wrapper
bluss Nov 13, 2018
c3cfd80
FEAT: Use aligned alloc for gemm
bluss Nov 13, 2018
dfd4148
TEST: Update sgemm_kernel tests to use the new Alloc
bluss Nov 13, 2018
b85879a
FEAT: in sgemm, move sse kernel aside
bluss Nov 13, 2018
54b6642
FEAT: in sgemm/dgemm add case for the sse2 target feature
bluss Nov 13, 2018
777fd75
FIX: Use the is_x86_target_feature_detected shim in dgemm too
bluss Nov 13, 2018
c579ee9
FIX: Remove uninitialized in dgemm fallback kernel
bluss Nov 13, 2018
62b937f
FEAT: Remove now-unused build script
bluss Nov 13, 2018
8a5a6b8
FIX: Set inline(never) on the sgemm kernel
bluss Nov 13, 2018
dce0762
FEAT: Clean up and improve comments in sgemm avx kernel
bluss Nov 13, 2018
0aa7e8a
FIX: Add de-striping comment to sgemm
bluss Nov 13, 2018
ad51677
FEAT: Add code that shows we can transpose the sgemm avx kernel
bluss Nov 13, 2018
955c32b
MAINT: Build in travis from Rust 1.28.0
bluss Nov 13, 2018
10e578f
TEST: Reduce number of benchmarks
bluss Nov 13, 2018
a9e62b3
FEAT: sgemm use unaligned loads
bluss Nov 13, 2018
f1ac534
FIX: In aligned alloc, use main alloc/dealloc functions
bluss Nov 13, 2018
157b166
FIX: in sgemm, move the α * (AB) multiplication down
bluss Nov 13, 2018
76d2806
FIX: Fixup debug!() statements, add one for packing buffer allocation
bluss Nov 13, 2018
bf57a21
FIX: In Alloc, rename parameter to nelem
bluss Nov 13, 2018
2202095
FIX: Pass number of elements (not bytes) when making a packing buffer
bluss Nov 13, 2018
7be5085
FIX: Remove unused imports in aligned_alloc.rs
bluss Nov 13, 2018
8a478da
FEAT: Schedule loads an iteration ahead in avx kernel
bluss Nov 13, 2018
69ddcfd
FIX: sgemm use aligned loads
bluss Nov 13, 2018
a537a03
FIX: aligned_alloc: don't use implicit `std`
bluss Nov 13, 2018
fdc80e6
FIX: Fix trait definition with missing parameter
bluss Nov 13, 2018
b5cc042
TEST: Make sure all kernels are tested
bluss Nov 13, 2018
4dad63f
TEST: Add internal crate for benchmarks vs Openblas
bluss Nov 15, 2018
6364fac
MAINT: Update copyright headers
bluss Nov 15, 2018
58b8683
TEST: Use "blas" crate for blas-bench
bluss Nov 15, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 39 additions & 11 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,21 +1,49 @@
language: rust
sudo: false

# run builds for all the trains (and more)
rust:
- 1.12.0
- stable
- beta
- nightly
matrix:
include:
- rust: 1.28.0
env:
TARGET=x86_64-unknown-linux-gnu
- rust: stable
env:
TARGET=x86_64-unknown-linux-gnu
- rust: stable
env:
TARGET=i686-unknown-linux-gnu
- rust: beta
env:
TARGET=x86_64-unknown-linux-gnu
- rust: nightly
env:
TARGET=x86_64-unknown-linux-gnu
- rust: nightly
env:
TARGET=aarch64-unknown-linux-gnu
BUILD_ONLY=1
env:
global:
- HOST=x86_64-unknown-linux-gnu

addons:
apt:
packages:
# needed for i686-unknown-linux-gnu target
- gcc-multilib
install:
# "rustup error: cannot re-add" without this conditional check
- if [[ $HOST != $TARGET ]]; then rustup target add $TARGET; fi

# the main build
script:
- |
cargo build &&
cargo test &&
cargo test --release &&
cargo doc &&
cargo bench
cargo build --target=$TARGET &&
([ -n "$BUILD_ONLY" ] || (
cargo test --target=$TARGET &&
cargo test --release --target=$TARGET &&
cargo doc --target=$TARGET &&
cargo bench --target=$TARGET ))

branches:
only:
Expand Down
2 changes: 0 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@ description = "General matrix multiplication of f32 and f64 matrices in Rust. Su

keywords = ["matrix", "sgemm", "dgemm"]

build = "build.rs"

[lib]
bench = false

Expand Down
2 changes: 1 addition & 1 deletion LICENSE-MIT
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Copyright (c) 2015
Copyright (c) 2016 - 2018 Ulrik Sverdrup "bluss"

Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
Expand Down
11 changes: 6 additions & 5 deletions benches/benchmarks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,43 +40,44 @@ macro_rules! mat_mul {
};
}

benchmark_main!(mat_mul_f32, mat_mul_f64, ref_mat_mul_f32);
benchmark_main!(mat_mul_f32, mat_mul_f64);

mat_mul!{mat_mul_f32, sgemm,
(m004, 4, 4, 4)
(m005, 5, 5, 5)
(m006, 6, 6, 6)
(m007, 7, 7, 7)
(m008, 8, 8, 8)
(m009, 9, 9, 9)
(m012, 12, 12, 12)
(m016, 16, 16, 16)
(m032, 32, 32, 32)
(m064, 64, 64, 64)
(m127, 127, 127, 127)
/*
(m256, 256, 256, 256)
(m512, 512, 512, 512)
(mix16x4, 32, 4, 32)
(mix32x2, 32, 2, 32)
(mix97, 97, 97, 125)
(mix128x10000x128, 128, 10000, 128)
*/
}

mat_mul!{mat_mul_f64, dgemm,
(m004, 4, 4, 4)
(m007, 7, 7, 7)
(m006, 6, 6, 6)
(m008, 8, 8, 8)
(m012, 12, 12, 12)
(m016, 16, 16, 16)
(m032, 32, 32, 32)
(m064, 64, 64, 64)
(m127, 127, 127, 127)
/*
(m256, 256, 256, 256)
(m512, 512, 512, 512)
(mix16x4, 32, 4, 32)
(mix32x2, 32, 2, 32)
(mix97, 97, 97, 125)
(mix128x10000x128, 128, 10000, 128)
*/
}

use std::ops::{Add, Mul};
Expand Down
31 changes: 31 additions & 0 deletions blas-bench/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
[package]
name = "blas-bench"
version = "0.1.0"
authors = ["bluss"]
publish = false

license = "MIT/Apache-2.0"

repository = "https://github.com/bluss/matrixmultiply/"
documentation = ""

description = "Blas benchmarks for comparison with matrixmultiply"

keywords = ["matrix", "sgemm", "dgemm"]

[lib]
bench = false

[[bench]]
name = "benchmarks"
harness = false

[dependencies]
rawpointer = "0.1"
matrixmultiply = { path = ".." }
blas = { version = "0.20", default-features = false }
blas-src = { version = "0.2.0", default-features = false }


[dev-dependencies]
bencher = "0.1.2"
7 changes: 7 additions & 0 deletions blas-bench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

Run BLAS benchmarks to compare with matrixmultiply.

These tests are set up to run vs a system-installed openblas (see the build.rs file),
because building all of openblas just to benchmark versus it is tedious.
So make sure openblas is installed, or another library that supports the cblas interface,
and tweak the build.rs file to suit.
81 changes: 81 additions & 0 deletions blas-bench/benches/benchmarks.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
extern crate blas_bench;
extern crate matrixmultiply;
pub use matrixmultiply::sgemm;
pub use matrixmultiply::dgemm;

#[macro_use]
extern crate bencher;
extern crate blas;

use std::os::raw::c_int;


/// Integer type that the matrix dimensions are cast to before being passed
/// to the `blas` gemm calls in this file.
// The lower-case type name is deliberate, hence the lint allowance.
#[allow(non_camel_case_types)]
type blas_index = c_int; // blas index type


// Compute GFlop/s
// by flop / s = 2 M N K / time


benchmark_main!(blas_mat_mul_f32, blas_mat_mul_f64);

/// Generate a module of BLAS gemm benchmarks plus its `benchmark_group!`.
///
/// Arguments:
/// - `$modname`: name of the generated module and benchmark group.
/// - `$gemm`: name of the function in the `blas` crate to call
///   (for example `sgemm` or `dgemm`).
/// - one or more `(name, m, n, k)` tuples; each becomes a benchmark function
///   `name` that multiplies an `m × n` matrix by an `n × k` matrix into an
///   `m × k` result (all buffers zero-filled).
///
/// The buffers are handed to BLAS as column-major, untransposed matrices, so
/// the BLAS dimension arguments are `M = m`, `N = k`, `K = n` and every
/// leading dimension is the row count of the corresponding matrix. This keeps
/// the dimensions consistent with the allocated buffer sizes even when
/// `m`, `n` and `k` differ.
macro_rules! blas_mat_mul {
    ($modname:ident, $gemm:ident, $(($name:ident, $m:expr, $n:expr, $k:expr))+) => {
        mod $modname {
            use bencher::Bencher;
            use super::blas_index;
            $(
            pub fn $name(bench: &mut Bencher)
            {
                let a = vec![0.; $m * $n];     // m × n, column-major
                let b = vec![0.; $n * $k];     // n × k, column-major
                let mut c = vec![0.; $m * $k]; // m × k, column-major
                bench.iter(|| {
                    unsafe {
                        blas::$gemm(
                            b'N',
                            b'N',
                            $m as blas_index, // M: rows of Op(a) and of c
                            $k as blas_index, // N: cols of Op(b) and of c
                            $n as blas_index, // K: cols of Op(a), rows of Op(b)
                            1.,               // alpha
                            &a,
                            $m as blas_index, // lda: rows of a
                            &b,
                            $n as blas_index, // ldb: rows of b
                            0.,               // beta
                            &mut c,
                            $m as blas_index, // ldc: rows of c
                        );
                    }
                });
            }
            )+
        }
        benchmark_group!{ $modname, $($modname::$name),+ }
    };
}

// Single-precision BLAS benchmarks. Each entry is (name, m, n, k);
// every case listed here is square.
blas_mat_mul!{blas_mat_mul_f32, sgemm,
(m004, 4, 4, 4)
(m006, 6, 6, 6)
(m008, 8, 8, 8)
(m012, 12, 12, 12)
(m016, 16, 16, 16)
(m032, 32, 32, 32)
(m064, 64, 64, 64)
(m127, 127, 127, 127)
}

// Double-precision BLAS benchmarks. Each entry is (name, m, n, k);
// every case listed here is square.
blas_mat_mul!{blas_mat_mul_f64, dgemm,
(m004, 4, 4, 4)
(m006, 6, 6, 6)
(m008, 8, 8, 8)
(m012, 12, 12, 12)
(m016, 16, 16, 16)
(m032, 32, 32, 32)
(m064, 64, 64, 64)
(m127, 127, 127, 127)
}
12 changes: 12 additions & 0 deletions blas-bench/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

/// Build script for the BLAS comparison benchmarks.
///
/// Unconditionally emits the cargo directives that re-run this script when
/// it changes and that link against `openblas` as a dynamic library.
fn main() {
    // Compiling a BLAS implementation only for benchmarking is tedious, so a
    // system-installed openblas is expected to be present instead.
    let link_kind = "dylib";
    println!("cargo:rerun-if-changed=build.rs");
    println!("cargo:rustc-link-lib={}=openblas", link_kind);
}
7 changes: 7 additions & 0 deletions blas-bench/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#[cfg(test)]
mod tests {
    /// Placeholder sanity check so the crate has at least one unit test.
    #[test]
    fn it_works() {
        let sum = 2 + 2;
        assert_eq!(sum, 4);
    }
}
14 changes: 0 additions & 14 deletions build.rs

This file was deleted.

84 changes: 84 additions & 0 deletions spare kernels/x86_sse_sgemm.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@

// 4x4 sse sgemm
/// Transpose, in place, a 4x4 block of `f32` stored as four `__m128` rows.
///
/// Performs the same shuffle sequence as the C `_MM_TRANSPOSE4_PS` macro,
/// but the four arguments are assignable expressions (e.g. `v[0]`) instead
/// of references. The SSE intrinsics must be in scope at the call site.
macro_rules! mm_transpose4 {
    ($c0:expr, $c1:expr, $c2:expr, $c3:expr) => {{
        // Interleave the low and high halves of each pair of rows.
        let lo_01 = _mm_unpacklo_ps($c0, $c1);
        let lo_23 = _mm_unpacklo_ps($c2, $c3);
        let hi_01 = _mm_unpackhi_ps($c0, $c1);
        let hi_23 = _mm_unpackhi_ps($c2, $c3);

        // Recombine 64-bit halves so each output holds one original column.
        $c0 = _mm_movelh_ps(lo_01, lo_23);
        $c1 = _mm_movehl_ps(lo_23, lo_01);
        $c2 = _mm_movelh_ps(hi_01, hi_23);
        $c3 = _mm_movehl_ps(hi_23, hi_01);
    }};
}

/// SSE micro-kernel: computes `C ← α A B + β C` for one small tile of `C`.
///
/// The tile is accumulated as `MR` vectors of four `f32` lanes each: one
/// vector per row of the tile, one lane per column.
///
/// Parameters:
/// - `k`: number of rank-1 update steps (iterations of the main loop).
/// - `alpha`, `beta`: the scalars α and β. They are passed straight to
///   `_mm_set1_ps`, so `T` must be `f32` here.
/// - `a`: packed panel of A; advanced by `MR` elements per step.
/// - `b`: packed panel of B; four values are loaded per step and the pointer
///   is advanced by `NR` elements.
/// - `c`: pointer to the top-left element of the output tile.
/// - `rsc`, `csc`: row and column strides of `c`, in elements.
///
/// When `beta` is zero, `c` is never read; it is only written.
///
/// NOTE(review): the body calls `mm_transpose4!` on exactly four vectors and
/// addresses columns 0..=3, so it presumably assumes `MR == 4` and
/// `NR == 4` — confirm against the kernel constants.
///
/// # Safety
///
/// - `b` must be 16-byte aligned at every step, because it is read with the
///   aligned load `_mm_load_ps`.
/// - `a` and `b` must be valid for reads for all `k` steps.
/// - Every element `c + rsc * i + csc * j` of the tile must be valid for
///   writes, and also for reads when `beta != 0`.
#[inline(always)]
#[cfg(any(target_arch="x86", target_arch="x86_64"))]
unsafe fn kernel_x86_sse(k: usize, alpha: T, a: *const T, b: *const T,
beta: T, c: *mut T, rsc: isize, csc: isize)
{
// One accumulator vector per row of the tile, all starting at zero.
let mut ab = [_mm_setzero_ps(); MR];

let mut bv;
let (mut a, mut b) = (a, b);

// Compute A B
// NOTE(review): `loop_m!` and `at` are defined outside this view; presumably
// `loop_m!(i, body)` runs `body` for each row index `i` in `0..MR` and
// `at(a, i)` reads the `i`-th element behind `a` — confirm.
for _ in 0..k {
bv = _mm_load_ps(b as _); // aligned due to GemmKernel::align_to

loop_m!(i, {
// Compute ab_i += [ai b_j+0, ai b_j+1, ai b_j+2, ai b_j+3]
let aiv = _mm_set1_ps(at(a, i));
ab[i] = _mm_add_ps(ab[i], _mm_mul_ps(aiv, bv));
});

a = a.add(MR);
b = b.add(NR);
}

// Compute α (A B)
let alphav = _mm_set1_ps(alpha);
loop_m!(i, ab[i] = _mm_mul_ps(alphav, ab[i]));

// `c![i, j]` yields the pointer to element (i, j) of the output tile.
// It is defined before the local array `c` below is introduced, and the
// `c` inside the macro body keeps referring to the pointer parameter
// (otherwise `.offset` would not type-check on the array).
macro_rules! c {
($i:expr, $j:expr) => (c.offset(rsc * $i as isize + csc * $j as isize));
}

// C ← α A B + β C
// From here on `c` names the in-register copy of the tile (one vector per
// row); memory is only reached through `c![..]`.
let mut c = [_mm_setzero_ps(); MR];
let betav = _mm_set1_ps(beta);
if beta != 0. {
// Read C
if csc == 1 {
// Rows are contiguous: load each row with one unaligned vector load.
loop_m!(i, c[i] = _mm_loadu_ps(c![i, 0]));
} else if rsc == 1 {
// Columns are contiguous: load columns, then transpose into rows.
loop_m!(i, c[i] = _mm_loadu_ps(c![0, i]));
mm_transpose4!(c[0], c[1], c[2], c[3]);
} else {
// General strides: gather element by element. `_mm_set_ps` takes its
// arguments from the highest lane to the lowest, hence the reversed order.
loop_m!(i, c[i] = _mm_set_ps(*c![i, 3], *c![i, 2], *c![i, 1], *c![i, 0]));
}
// Compute β C
loop_m!(i, c[i] = _mm_mul_ps(c[i], betav));
}

// Compute (α A B) + (β C)
loop_m!(i, c[i] = _mm_add_ps(c[i], ab[i]));

// Store C back to memory
if csc == 1 {
loop_m!(i, _mm_storeu_ps(c![i, 0], c[i]));
} else if rsc == 1 {
// Transpose back so each vector holds one contiguous column again.
mm_transpose4!(c[0], c[1], c[2], c[3]);
loop_m!(i, _mm_storeu_ps(c![0, i], c[i]));
} else {
// extract the nth value of a vector using _mm_cvtss_f32 (extract lowest)
// in combination with shuffle (move nth value to first position)
loop_m!(i, *c![i, 0] = _mm_cvtss_f32(c[i]));
loop_m!(i, *c![i, 1] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 1)));
loop_m!(i, *c![i, 2] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 2)));
loop_m!(i, *c![i, 3] = _mm_cvtss_f32(_mm_shuffle_ps(c[i], c[i], 3)));
}
}
Loading