From 1c768ee58d03bf52d695742810c1e12aded78c37 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Tue, 21 Mar 2023 23:31:31 +0100 Subject: [PATCH 01/25] beginning activation vm --- linalg/activations/Cargo.toml | 12 +++ linalg/activations/src/main.rs | 191 +++++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 linalg/activations/Cargo.toml create mode 100644 linalg/activations/src/main.rs diff --git a/linalg/activations/Cargo.toml b/linalg/activations/Cargo.toml new file mode 100644 index 0000000000..bf99d8ac22 --- /dev/null +++ b/linalg/activations/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "activations" +version = "0.1.0" +edition = "2021" + +[workspace] +members = [] + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dev-dependencies] +proptest = "1.1.0" diff --git a/linalg/activations/src/main.rs b/linalg/activations/src/main.rs new file mode 100644 index 0000000000..cf9257d614 --- /dev/null +++ b/linalg/activations/src/main.rs @@ -0,0 +1,191 @@ +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum RegisterId { + A = 0, + B = 1, + C = 2, + D = 3, +} + +type ConstantId = usize; + +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Op { + Move(RegisterId, RegisterId), + Load(RegisterId, ConstantId), + Abs, + Recip, + Add, + Sub, + Mul, + Min, + Max, + AddConst(ConstantId), + SubConst(ConstantId), + MulConst(ConstantId), + MinConst(ConstantId), + MaxConst(ConstantId), + IfPosTE, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct Program { + ops: Vec, + csts: Vec, +} + +impl Program { + fn compute(&self, x: f32) -> f32 { + let mut regs = [0f32; 4]; + regs[0] = x; + let mut constants = self.csts.clone(); + constants.insert(0, 0f32); + constants.insert(1, 1f32); + for op in &self.ops { + match op { + Op::Move(dst, src) => regs[*dst as usize] = regs[*src as usize], + Op::Load(dst, cst) => regs[*dst as usize] = constants[*cst], + Op::Abs => regs[0] = regs[0].abs(), + Op::Recip => regs[0] = regs[0].recip(), + Op::Add => regs[0] = regs[0] + regs[1], + Op::Sub => regs[0] = regs[0] - regs[1], + Op::Mul => regs[0] = regs[0] * regs[1], + Op::Min => regs[0] = regs[0].min(regs[1]), + Op::Max => regs[0] = regs[0].max(regs[1]), + Op::AddConst(cst) => regs[0] = regs[0] + constants[*cst], + Op::SubConst(cst) => regs[0] = regs[0] - constants[*cst], + Op::MulConst(cst) => regs[0] = regs[0] * constants[*cst], + Op::MinConst(cst) => regs[0] = regs[0].min(constants[*cst]), + Op::MaxConst(cst) => regs[0] = regs[0].max(constants[*cst]), + Op::IfPosTE => regs[0] = if regs[0] >= 0f32 { regs[1] } else { regs[2] }, + } + } + regs[0] + } +} + +mod definitions { + use super::Op::*; + use super::RegisterId::*; + use super::*; + + pub fn relu() -> Program { + Program { ops: vec![MaxConst(0)], csts: vec![] } + } + + pub fn affine(alpha: f32, beta: f32) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MulConst(2), + AddConst(3), + ], + csts: vec![alpha, beta], + } + } + + pub fn leaky_relu(alpha: f32) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B,A), + MulConst(2), + Move(C,A), + Move(A,B), + IfPosTE, + ], + csts: vec![alpha], + } + } + + pub fn threshold_relu(alpha: f32) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B,A), + SubConst(2), + Load(C,0), + IfPosTE, + ], + csts: vec![alpha], + } + } + + pub fn softsign() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B,A), + Abs, + AddConst(1), + Recip, + Mul, + ], + csts: vec![], + } + } + + pub fn hardswish() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B, A), + MulConst(2), + AddConst(3), + MinConst(1), + MaxConst(0), + Mul, + ], + csts: vec![1f32 / 6., 0.5], + } + } +} + +#[cfg(test)] +mod test { + use proptest::prelude::*; + + fn close_enough(a: f32, b: f32) -> bool { + fn max(a: f32, b: f32) -> f32 { + if a < b { + b + } else { + a + } + } + let rtol = 1e-05; + let atol = 1e-08; + return (a - b).abs() <= max(rtol * max(a.abs(), b.abs()), atol); + } + + proptest! { + #[test] + fn test_relu(x in any::()) { + prop_assert_eq!(super::definitions::relu().compute(x), x.max(0f32)) + } + + #[test] + fn test_affine(x in any::(), alpha in any::(), beta in any::()) { + prop_assert_eq!(super::definitions::affine(alpha, beta).compute(x), alpha * x + beta) + } + + #[test] + fn test_leaky_relu(x in any::(), alpha in any::()) { + prop_assert_eq!(super::definitions::leaky_relu(alpha).compute(x), if x > 0f32 { x } else { alpha * x }); + } + + #[test] + fn test_threshold_relu(x in any::(), alpha in any::()) { + prop_assert_eq!(super::definitions::threshold_relu(alpha).compute(x), if x >= alpha { x } else { 0f32 } ); + } + + #[test] + fn test_subsign(x in any::()) { + prop_assert!(close_enough(super::definitions::softsign().compute(x), x / (1.+x.abs()))); + } + + #[test] + fn test_hardswish(x in any::()) { + prop_assert!(close_enough(super::definitions::hardswish().compute(x), x * 0f32.max( 1f32.min((1./6.) * x + 0.5)))); + } + } +} From 7e2cb7cddd12d92665ba87801140b4d57613abc3 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Tue, 28 Mar 2023 23:31:22 +0200 Subject: [PATCH 02/25] wip, sigmoid and exp functions --- linalg/activations/src/main.rs | 192 +++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/linalg/activations/src/main.rs b/linalg/activations/src/main.rs index cf9257d614..b32b5e20f5 100644 --- a/linalg/activations/src/main.rs +++ b/linalg/activations/src/main.rs @@ -24,7 +24,11 @@ pub enum Op { MulConst(ConstantId), MinConst(ConstantId), MaxConst(ConstantId), + FMA(ConstantId), // a <- a * b + cst IfPosTE, + SwapBC, + Floor, + TwoPowOfInt, } #[derive(Clone, Debug, PartialEq)] @@ -57,6 +61,12 @@ impl Program { Op::MinConst(cst) => regs[0] = regs[0].min(constants[*cst]), Op::MaxConst(cst) => regs[0] = regs[0].max(constants[*cst]), Op::IfPosTE => regs[0] = if regs[0] >= 0f32 { regs[1] } else { regs[2] }, + Op::FMA(cst) => regs[0] = regs[0] * regs[1] + constants[*cst], + Op::SwapBC => regs.swap(1, 2), + Op::Floor => regs[0] = regs[0].floor(), + Op::TwoPowOfInt => { + regs[0] = f32::from_bits((((regs[0] as i32) + 127) as u32) << 23) + } } } regs[0] @@ -138,12 +148,103 @@ mod definitions { csts: vec![1f32 / 6., 0.5], } } + + pub fn sigmoid() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MinConst(3), + MaxConst(2), + Move(B, A), // b = x + Move(C, A), // c = x + Mul, // a = x2 + Move(B, A), // b = x2 + MulConst(4), + AddConst(5), // a = x2 * a13 + a11 + FMA(6), + FMA(7), + FMA(8), + FMA(9), + FMA(10), + SwapBC, // c = x2, b = x + Mul, // a = p(x) + Move(B, C), // b = x2 + Move(C, A), // c = p(x) + Move(A, B), // a = x2 + MulConst(11), + AddConst(12), + FMA(13), + FMA(1), // a = q(x) + Recip, + Move(B,C), // b = p(x) + Mul, + AddConst(14) + ], + csts: vec![ + -18.6, // const 2 + 18.6, // const 3 + -4.433153405e-18, // const 4, also alpha_13 + 1.169974371e-14, // const 5, also a11 + -1.875289645e-11, + 4.257889523e-8, + 0.00004811817576, // const 8 + 0.008163842030, + 0.2499999971, // alpha_1 + 3.922935744e-6, // beta_6 + 0.001524872358, // const 12 + 0.1159886749, + 0.5, //beta_0 + ], + } + } + + pub fn exp2f() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MinConst(2), + MaxConst(3), + Move(B, A), // b = x + AddConst(4), // a = x + 0.5 + Floor, // a = ipart + Move(C, A), // c = ipart + Move(A, B), // a = x + Move(B, C), // b = ipart + Sub, // a = fpart + Move(B, A), // b = fpart + Load(A, 5), // a = exp2p[0] + FMA(6), + FMA(7), + FMA(8), + FMA(9), + FMA(10), + FMA(1), // a = y + Move(B, A), + Move(A, C), + TwoPowOfInt, + Mul + ], + csts: vec![ + 127f32, + -127f32, + 0.5, + 1.535336188319500e-4, + 1.339887440266574e-3, + 9.618437357674640e-3, + 5.550332471162809e-2, + 2.402264791363012e-1, + 6.931472028550421e-1, + ], + } + } } #[cfg(test)] mod test { use proptest::prelude::*; + use crate::{exp2f, ssigmoid}; + fn close_enough(a: f32, b: f32) -> bool { fn max(a: f32, b: f32) -> f32 { if a < b { @@ -187,5 +288,96 @@ mod test { fn test_hardswish(x in any::()) { prop_assert!(close_enough(super::definitions::hardswish().compute(x), x * 0f32.max( 1f32.min((1./6.) * x + 0.5)))); } + + #[test] + fn test_sigmoid(x in any::()) { + prop_assert!(close_enough(super::definitions::sigmoid().compute(x), ssigmoid(x))); + } + + #[test] + fn test_ref_exp2f(x in any::()) { + prop_assert!(close_enough(exp2f(x), 2f32.powf(x))); + } + #[test] + fn test_cm_exp2f(x in any::()) { + prop_assert!(close_enough(super::definitions::exp2f().compute(x), exp2f(x))); + } } } + +pub fn ssigmoid(x: f32) -> f32 { + const LOW: f32 = -18.6; + const HIGH: f32 = -LOW; + + const ALPHA_13: f32 = -4.433153405e-18; + const ALPHA_11: f32 = 1.169974371e-14; + const ALPHA_9: f32 = -1.875289645e-11; + const ALPHA_7: f32 = 4.257889523e-8; + const ALPHA_5: f32 = 0.00004811817576; + const ALPHA_3: f32 = 0.008163842030; + const ALPHA_1: f32 = 0.2499999971; + const BETA_6: f32 = 3.922935744e-6; + const BETA_4: f32 = 0.001524872358; + const BETA_2: f32 = 0.1159886749; + const BETA_0: f32 = 1.0; + + let x = x.clamp(LOW, HIGH); + + let x2 = x * x; + + let p = ALPHA_13; + let p = x2 * p + ALPHA_11; + let p = x2 * p + ALPHA_9; + let p = x2 * p + ALPHA_7; + let p = x2 * p + ALPHA_5; + let p = x2 * p + ALPHA_3; + let p = x2 * p + ALPHA_1; + // a=p, b=x2, c=x + // swap(b,c) + // a=p, b=x, c=x2 + // mul + let p = p * x; + + // a=p, b=x, c=x2 + // mov(b, c) b = x2 + // mov(c, a) a = p + let q = BETA_6; + let q = x2 * q + BETA_4; + let q = x2 * q + BETA_2; + let q = x2 * q + BETA_0; + dbg!(p, q); + + p / q + 0.5 +} + +pub fn exp2f(x: f32) -> f32 { + const EXP2P: [f32; 7] = [ + 1.535336188319500e-4, + 1.339887440266574e-3, + 9.618437357674640e-3, + 5.550332471162809e-2, + 2.402264791363012e-1, + 6.931472028550421e-1, + 1.000000000000000, + ]; + + let x = x.min(127f32).max(-127f32); + + let ipart = (x + 0.5).floor(); + let fpart = x - ipart; + + // 2^ipart + let two_pow_ipart = f32::from_bits((((ipart as i32) + 127) as u32) << 23); + + let mut y = EXP2P[0]; + y = y * fpart + EXP2P[1]; + y = y * fpart + EXP2P[2]; + y = y * fpart + EXP2P[3]; + y = y * fpart + EXP2P[4]; + y = y * fpart + EXP2P[5]; + y = y * fpart + EXP2P[6]; +dbg!(y, two_pow_ipart); + y * two_pow_ipart +} + +fn main() {} From bb8e887df5b32acb199b062267a81ea678ab43ca Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 3 Apr 2023 20:43:08 +0200 Subject: [PATCH 03/25] cleanup --- linalg/activations/src/main.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/linalg/activations/src/main.rs b/linalg/activations/src/main.rs index b32b5e20f5..72c862b081 100644 --- a/linalg/activations/src/main.rs +++ b/linalg/activations/src/main.rs @@ -376,7 +376,6 @@ pub fn exp2f(x: f32) -> f32 { y = y * fpart + EXP2P[4]; y = y * fpart + EXP2P[5]; y = y * fpart + EXP2P[6]; -dbg!(y, two_pow_ipart); y * two_pow_ipart } From 2d2c360f3a56e77a3284c6f5201439c4d539054b Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 3 Apr 2023 21:29:10 +0200 Subject: [PATCH 04/25] benches first commit --- linalg/activations/Cargo.toml | 8 + linalg/activations/benches/vm.rs | 35 +++ linalg/activations/src/{main.rs => lib.rs} | 246 ++++++++++++--------- 3 files changed, 184 insertions(+), 105 deletions(-) create mode 100644 linalg/activations/benches/vm.rs rename linalg/activations/src/{main.rs => lib.rs} (67%) diff --git a/linalg/activations/Cargo.toml b/linalg/activations/Cargo.toml index bf99d8ac22..348f099e6b 100644 --- a/linalg/activations/Cargo.toml +++ b/linalg/activations/Cargo.toml @@ -9,4 +9,12 @@ members = [] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dev-dependencies] +criterion = "0.4.0" proptest = "1.1.0" + +[dependencies] + + +[[bench]] +name = "vm" +harness = false diff --git a/linalg/activations/benches/vm.rs b/linalg/activations/benches/vm.rs new file mode 100644 index 0000000000..14c226623b --- /dev/null +++ b/linalg/activations/benches/vm.rs @@ -0,0 +1,35 @@ +use activations::{definitions, reference}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, BatchSize}; + +pub fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("relu"); + for size in [1i32, 32, 256, 1024, 8192].iter() { + group.throughput(criterion::Throughput::Elements(*size as u64)); + group.bench_with_input(BenchmarkId::new("Reference", size), size, |b, size| { + b.iter_batched( + || vec![1.0f32; *size as usize], + |v| { + for x in v { + reference::relu(black_box(x)); + } + }, + BatchSize::LargeInput + ) + }); + let d = definitions::relu(); + group.bench_with_input(BenchmarkId::new("VM", size), size, |b, size| { + b.iter_batched( + || vec![1.0f32; *size as usize], + |v| { + for x in v { + d.compute(black_box(x)); + } + }, + BatchSize::LargeInput + ) + }); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/linalg/activations/src/main.rs b/linalg/activations/src/lib.rs similarity index 67% rename from linalg/activations/src/main.rs rename to linalg/activations/src/lib.rs index 72c862b081..d826500bbb 100644 --- a/linalg/activations/src/main.rs +++ b/linalg/activations/src/lib.rs @@ -38,7 +38,7 @@ pub struct Program { } impl Program { - fn compute(&self, x: f32) -> f32 { + pub fn compute(&self, x: f32) -> f32 { let mut regs = [0f32; 4]; regs[0] = x; let mut constants = self.csts.clone(); @@ -73,7 +73,7 @@ impl Program { } } -mod definitions { +pub mod definitions { use super::Op::*; use super::RegisterId::*; use super::*; @@ -223,7 +223,7 @@ mod definitions { Move(A, C), TwoPowOfInt, Mul - ], + ], csts: vec![ 127f32, -127f32, @@ -239,12 +239,122 @@ mod definitions { } } +pub mod reference { + pub fn relu(x: f32) -> f32 { + x.max(0f32) + } + + pub fn affine(x: f32, alpha: f32, beta: f32) -> f32 { + alpha * x + beta + } + + pub fn leaky_relu(x: f32, alpha: f32) -> f32 { + if x > 0f32 { + x + } else { + alpha * x + } + } + + pub fn threshold_relu(x: f32, alpha: f32) -> f32 { + if x >= alpha { + x + } else { + 0f32 + } + } + + pub fn subsign(x: f32) -> f32 { + x / (1. + x.abs()) + } + + pub fn hardswish(x: f32) -> f32 { + x * 0f32.max(1f32.min((1. / 6.) * x + 0.5)) + } + + pub fn sigmoid(x: f32) -> f32 { + ssigmoid(x) + } + + pub fn ref_exp2f(x: f32) -> f32 { + 2f32.powf(x) + } + + pub fn cm_exp2f(x: f32) -> f32 { + exp2f(x) + } + + fn ssigmoid(x: f32) -> f32 { + const LOW: f32 = -18.6; + const HIGH: f32 = -LOW; + + const ALPHA_13: f32 = -4.433153405e-18; + const ALPHA_11: f32 = 1.169974371e-14; + const ALPHA_9: f32 = -1.875289645e-11; + const ALPHA_7: f32 = 4.257889523e-8; + const ALPHA_5: f32 = 0.00004811817576; + const ALPHA_3: f32 = 0.008163842030; + const ALPHA_1: f32 = 0.2499999971; + const BETA_6: f32 = 3.922935744e-6; + const BETA_4: f32 = 0.001524872358; + const BETA_2: f32 = 0.1159886749; + const BETA_0: f32 = 1.0; + + let x = x.clamp(LOW, HIGH); + + let x2 = x * x; + + let p = ALPHA_13; + let p = x2 * p + ALPHA_11; + let p = x2 * p + ALPHA_9; + let p = x2 * p + ALPHA_7; + let p = x2 * p + ALPHA_5; + let p = x2 * p + ALPHA_3; + let p = x2 * p + ALPHA_1; + let p = p * x; + + let q = BETA_6; + let q = x2 * q + BETA_4; + let q = x2 * q + BETA_2; + let q = x2 * q + BETA_0; + + p / q + 0.5 + } + + pub fn exp2f(x: f32) -> f32 { + const EXP2P: [f32; 7] = [ + 1.535336188319500e-4, + 1.339887440266574e-3, + 9.618437357674640e-3, + 5.550332471162809e-2, + 2.402264791363012e-1, + 6.931472028550421e-1, + 1.000000000000000, + ]; + + let x = x.min(127f32).max(-127f32); + + let ipart = (x + 0.5).floor(); + let fpart = x - ipart; + + // 2^ipart + let two_pow_ipart = f32::from_bits((((ipart as i32) + 127) as u32) << 23); + + let mut y = EXP2P[0]; + y = y * fpart + EXP2P[1]; + y = y * fpart + EXP2P[2]; + y = y * fpart + EXP2P[3]; + y = y * fpart + EXP2P[4]; + y = y * fpart + EXP2P[5]; + y = y * fpart + EXP2P[6]; + y * two_pow_ipart + } +} + #[cfg(test)] mod test { use proptest::prelude::*; - use crate::{exp2f, ssigmoid}; - fn close_enough(a: f32, b: f32) -> bool { fn max(a: f32, b: f32) -> f32 { if a < b { @@ -254,129 +364,55 @@ mod test { } } let rtol = 1e-05; - let atol = 1e-08; - return (a - b).abs() <= max(rtol * max(a.abs(), b.abs()), atol); + let atol = 1e-06; + let result = (a - b).abs() <= max(rtol * max(a.abs(), b.abs()), atol); + if !result { + dbg!(a, b); + } + return result } proptest! { #[test] fn test_relu(x in any::()) { - prop_assert_eq!(super::definitions::relu().compute(x), x.max(0f32)) + prop_assert_eq!(super::definitions::relu().compute(x), super::reference::relu(x)) } #[test] fn test_affine(x in any::(), alpha in any::(), beta in any::()) { - prop_assert_eq!(super::definitions::affine(alpha, beta).compute(x), alpha * x + beta) + prop_assert_eq!(super::definitions::affine(alpha, beta).compute(x), + super::reference::affine(x, alpha, beta)) } #[test] fn test_leaky_relu(x in any::(), alpha in any::()) { - prop_assert_eq!(super::definitions::leaky_relu(alpha).compute(x), if x > 0f32 { x } else { alpha * x }); + prop_assert_eq!(super::definitions::leaky_relu(alpha).compute(x),super::reference::leaky_relu(x, alpha)) } #[test] fn test_threshold_relu(x in any::(), alpha in any::()) { - prop_assert_eq!(super::definitions::threshold_relu(alpha).compute(x), if x >= alpha { x } else { 0f32 } ); + prop_assert_eq!(super::definitions::threshold_relu(alpha).compute(x), super::reference::threshold_relu(x, alpha) ); } #[test] fn test_subsign(x in any::()) { - prop_assert!(close_enough(super::definitions::softsign().compute(x), x / (1.+x.abs()))); - } - - #[test] - fn test_hardswish(x in any::()) { - prop_assert!(close_enough(super::definitions::hardswish().compute(x), x * 0f32.max( 1f32.min((1./6.) * x + 0.5)))); - } + prop_assert!(close_enough(super::definitions::softsign().compute(x), super::reference::subsign(x))); + } - #[test] - fn test_sigmoid(x in any::()) { - prop_assert!(close_enough(super::definitions::sigmoid().compute(x), ssigmoid(x))); - } - #[test] - fn test_ref_exp2f(x in any::()) { - prop_assert!(close_enough(exp2f(x), 2f32.powf(x))); - } - #[test] - fn test_cm_exp2f(x in any::()) { - prop_assert!(close_enough(super::definitions::exp2f().compute(x), exp2f(x))); - } - } -} + #[test] + fn test_hardswish(x in any::()) { + prop_assert!(close_enough(super::definitions::hardswish().compute(x), super::reference::hardswish(x))); + } -pub fn ssigmoid(x: f32) -> f32 { - const LOW: f32 = -18.6; - const HIGH: f32 = -LOW; - - const ALPHA_13: f32 = -4.433153405e-18; - const ALPHA_11: f32 = 1.169974371e-14; - const ALPHA_9: f32 = -1.875289645e-11; - const ALPHA_7: f32 = 4.257889523e-8; - const ALPHA_5: f32 = 0.00004811817576; - const ALPHA_3: f32 = 0.008163842030; - const ALPHA_1: f32 = 0.2499999971; - const BETA_6: f32 = 3.922935744e-6; - const BETA_4: f32 = 0.001524872358; - const BETA_2: f32 = 0.1159886749; - const BETA_0: f32 = 1.0; - - let x = x.clamp(LOW, HIGH); - - let x2 = x * x; - - let p = ALPHA_13; - let p = x2 * p + ALPHA_11; - let p = x2 * p + ALPHA_9; - let p = x2 * p + ALPHA_7; - let p = x2 * p + ALPHA_5; - let p = x2 * p + ALPHA_3; - let p = x2 * p + ALPHA_1; - // a=p, b=x2, c=x - // swap(b,c) - // a=p, b=x, c=x2 - // mul - let p = p * x; - - // a=p, b=x, c=x2 - // mov(b, c) b = x2 - // mov(c, a) a = p - let q = BETA_6; - let q = x2 * q + BETA_4; - let q = x2 * q + BETA_2; - let q = x2 * q + BETA_0; - dbg!(p, q); - - p / q + 0.5 -} + #[test] + fn test_sigmoid(x in any::()) { + prop_assert!(close_enough(super::definitions::sigmoid().compute(x), super::reference::sigmoid(x))); + } -pub fn exp2f(x: f32) -> f32 { - const EXP2P: [f32; 7] = [ - 1.535336188319500e-4, - 1.339887440266574e-3, - 9.618437357674640e-3, - 5.550332471162809e-2, - 2.402264791363012e-1, - 6.931472028550421e-1, - 1.000000000000000, - ]; - - let x = x.min(127f32).max(-127f32); - - let ipart = (x + 0.5).floor(); - let fpart = x - ipart; - - // 2^ipart - let two_pow_ipart = f32::from_bits((((ipart as i32) + 127) as u32) << 23); - - let mut y = EXP2P[0]; - y = y * fpart + EXP2P[1]; - y = y * fpart + EXP2P[2]; - y = y * fpart + EXP2P[3]; - y = y * fpart + EXP2P[4]; - y = y * fpart + EXP2P[5]; - y = y * fpart + EXP2P[6]; - y * two_pow_ipart + #[test] + fn test_cm_exp2f(x in any::()) { + prop_assert!(close_enough(super::definitions::exp2f().compute(x), super::reference::exp2f(x))); + } + } } - -fn main() {} From c185a188e47dbdad03613229c6b6db3c0878fbe9 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 3 Apr 2023 21:38:07 +0200 Subject: [PATCH 05/25] bench several funcs --- linalg/activations/benches/vm.rs | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/linalg/activations/benches/vm.rs b/linalg/activations/benches/vm.rs index 14c226623b..e8527def60 100644 --- a/linalg/activations/benches/vm.rs +++ b/linalg/activations/benches/vm.rs @@ -1,8 +1,8 @@ -use activations::{definitions, reference}; -use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, BatchSize}; +use activations::{definitions, reference, Program}; +use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion}; -pub fn criterion_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("relu"); +fn crit(c: &mut Criterion, name: &str, r: impl Fn(f32) -> f32, prog: &Program) { + let mut group = c.benchmark_group(name); for size in [1i32, 32, 256, 1024, 8192].iter() { group.throughput(criterion::Throughput::Elements(*size as u64)); group.bench_with_input(BenchmarkId::new("Reference", size), size, |b, size| { @@ -10,26 +10,32 @@ pub fn criterion_benchmark(c: &mut Criterion) { || vec![1.0f32; *size as usize], |v| { for x in v { - reference::relu(black_box(x)); + r(black_box(x)); } }, - BatchSize::LargeInput - ) + BatchSize::LargeInput, + ) }); - let d = definitions::relu(); group.bench_with_input(BenchmarkId::new("VM", size), size, |b, size| { b.iter_batched( || vec![1.0f32; *size as usize], |v| { for x in v { - d.compute(black_box(x)); + prog.compute(black_box(x)); } }, - BatchSize::LargeInput - ) + BatchSize::LargeInput, + ) }); } } +fn criterion_benchmark(c: &mut Criterion) { + crit(c, "relu", reference::relu, &definitions::relu()); + crit(c, "hardswish", reference::hardswish, &definitions::hardswish()); + crit(c, "exp2f", reference::exp2f, &definitions::exp2f()); + crit(c, "sigmoid", reference::sigmoid, &definitions::sigmoid()); +} + criterion_group!(benches, criterion_benchmark); criterion_main!(benches); From cfb20a669e1c2eefb66cfc6e87544d95396897f9 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 3 Apr 2023 22:37:22 +0200 Subject: [PATCH 06/25] vectorizing computation --- linalg/activations/benches/vm.rs | 9 + linalg/activations/src/definitions.rs | 164 ++++++++++ linalg/activations/src/lib.rs | 436 +++++++------------------- linalg/activations/src/reference.rs | 110 +++++++ 4 files changed, 399 insertions(+), 320 deletions(-) create mode 100644 linalg/activations/src/definitions.rs create mode 100644 linalg/activations/src/reference.rs diff --git a/linalg/activations/benches/vm.rs b/linalg/activations/benches/vm.rs index e8527def60..ec61f6c447 100644 --- a/linalg/activations/benches/vm.rs +++ b/linalg/activations/benches/vm.rs @@ -27,6 +27,15 @@ fn crit(c: &mut Criterion, name: &str, r: impl Fn(f32) -> f32, prog: &Program) { BatchSize::LargeInput, ) }); + group.bench_with_input(BenchmarkId::new("VMVec", size), size, |b, size| { + b.iter_batched( + || vec![1.0f32; *size as usize], + |mut v| { + prog.compute_slice(black_box(&mut v)); + }, + BatchSize::LargeInput, + ) + }); } } diff --git a/linalg/activations/src/definitions.rs b/linalg/activations/src/definitions.rs new file mode 100644 index 0000000000..21d5ae86b8 --- /dev/null +++ b/linalg/activations/src/definitions.rs @@ -0,0 +1,164 @@ + +use super::Op::*; +use super::RegisterId::*; +use super::*; + +pub fn relu() -> Program { + Program { ops: vec![MaxConst(0)], csts: vec![] } +} + +pub fn affine(alpha: f32, beta: f32) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MulConst(2), + AddConst(3), + ], + csts: vec![alpha, beta], + } +} + +pub fn leaky_relu(alpha: f32) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B,A), + MulConst(2), + Move(C,A), + Move(A,B), + IfPosTE, + ], + csts: vec![alpha], + } +} + +pub fn threshold_relu(alpha: f32) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B,A), + SubConst(2), + Load(C,0), + IfPosTE, + ], + csts: vec![alpha], + } +} + +pub fn softsign() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B,A), + Abs, + AddConst(1), + Recip, + Mul, + ], + csts: vec![], + } +} + +pub fn hardswish() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B, A), + MulConst(2), + AddConst(3), + MinConst(1), + MaxConst(0), + Mul, + ], + csts: vec![1f32 / 6., 0.5], + } +} + +pub fn sigmoid() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MinConst(3), + MaxConst(2), + Move(B, A), // b = x + Move(C, A), // c = x + Mul, // a = x2 + Move(B, A), // b = x2 + MulConst(4), + AddConst(5), // a = x2 * a13 + a11 + FMA(6), + FMA(7), + FMA(8), + FMA(9), + FMA(10), + SwapBC, // c = x2, b = x + Mul, // a = p(x) + Move(B, C), // b = x2 + Move(C, A), // c = p(x) + Move(A, B), // a = x2 + MulConst(11), + AddConst(12), + FMA(13), + FMA(1), // a = q(x) + Recip, + Move(B,C), // b = p(x) + Mul, + AddConst(14) + ], + csts: vec![ + -18.6, // const 2 + 18.6, // const 3 + -4.433153405e-18, // const 4, also alpha_13 + 1.169974371e-14, // const 5, also a11 + -1.875289645e-11, + 4.257889523e-8, + 0.00004811817576, // const 8 + 0.008163842030, + 0.2499999971, // alpha_1 + 3.922935744e-6, // beta_6 + 0.001524872358, // const 12 + 0.1159886749, + 0.5, //beta_0 + ], + } +} + +pub fn exp2f() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MinConst(2), + MaxConst(3), + Move(B, A), // b = x + AddConst(4), // a = x + 0.5 + Floor, // a = ipart + Move(C, A), // c = ipart + Move(A, B), // a = x + Move(B, C), // b = ipart + Sub, // a = fpart + Move(B, A), // b = fpart + Load(A, 5), // a = exp2p[0] + FMA(6), + FMA(7), + FMA(8), + FMA(9), + FMA(10), + FMA(1), // a = y + Move(B, A), + Move(A, C), + TwoPowOfInt, + Mul + ], + csts: vec![ + 127f32, + -127f32, + 0.5, + 1.535336188319500e-4, + 1.339887440266574e-3, + 9.618437357674640e-3, + 5.550332471162809e-2, + 2.402264791363012e-1, + 6.931472028550421e-1, + ], + } +} diff --git a/linalg/activations/src/lib.rs b/linalg/activations/src/lib.rs index d826500bbb..f0a55d4638 100644 --- a/linalg/activations/src/lib.rs +++ b/linalg/activations/src/lib.rs @@ -1,9 +1,11 @@ +pub mod definitions; +pub mod reference; + #[derive(Copy, Clone, Debug, PartialEq)] pub enum RegisterId { A = 0, B = 1, C = 2, - D = 3, } type ConstantId = usize; @@ -38,8 +40,72 @@ pub struct Program { } impl Program { + pub fn compute_slice(&self, xs: &mut [f32]) { + let mut a = xs.to_vec(); + let mut b = vec![0.0f32; a.len()]; + let mut c = vec![0.0f32; a.len()]; + let mut constants = self.csts.clone(); + constants.insert(0, 0f32); + constants.insert(1, 1f32); + for op in &self.ops { + match op { + Op::Move(dst, src) => { + let mut regs = [&mut a, &mut b, &mut c]; + let dst = *dst as usize; + let src = *src as usize; + if dst < src { + let (left, right) = regs.split_at_mut(src); + let d = &mut **left[dst]; + let s = &**right[0]; + d.copy_from_slice(s) + } else { + let (left, right) = regs.split_at_mut(dst); + let s = &**left[src]; + let d = &mut **right[0]; + d.copy_from_slice(s) + } + } + Op::Load(dst, cst) if *dst == RegisterId::A => { + a.iter_mut().for_each(|x| *x = constants[*cst]) + } + Op::Load(dst, cst) if *dst == RegisterId::B => { + b.iter_mut().for_each(|x| *x = constants[*cst]) + } + Op::Load(_dst, cst) => c.iter_mut().for_each(|x| *x = constants[*cst]), + Op::Abs => a.iter_mut().for_each(|x| *x = x.abs()), + Op::Recip => a.iter_mut().for_each(|x| *x = x.recip()), + Op::Add => a.iter_mut().zip(&b).for_each(|(x, y)| *x += *y), + Op::Sub => a.iter_mut().zip(&b).for_each(|(x, y)| *x -= *y), + Op::Mul => a.iter_mut().zip(&b).for_each(|(x, y)| *x *= *y), + Op::Min => a.iter_mut().zip(&b).for_each(|(x, y)| *x = x.min(*y)), + Op::Max => a.iter_mut().zip(&b).for_each(|(x, y)| *x = x.max(*y)), + Op::AddConst(cst) => a.iter_mut().for_each(|x| *x += constants[*cst]), + Op::SubConst(cst) => a.iter_mut().for_each(|x| *x -= constants[*cst]), + Op::MulConst(cst) => a.iter_mut().for_each(|x| *x *= constants[*cst]), + Op::MinConst(cst) => a.iter_mut().for_each(|x| *x = x.min(constants[*cst])), + Op::MaxConst(cst) => a.iter_mut().for_each(|x| *x = x.max(constants[*cst])), + Op::IfPosTE => a + .iter_mut() + .zip(&b) + .zip(&c) + .for_each(|((x, y), z)| *x = if *x >= 0f32 { *y } else { *z }), + Op::FMA(cst) => { + a.iter_mut().zip(&b).for_each(|(x, y)| *x = *x * *y + constants[*cst]) + } + Op::SwapBC => { + b.iter_mut().zip(c.iter_mut()).for_each(|(b, c)| std::mem::swap(b, c)) + } + Op::Floor => a.iter_mut().for_each(|x| *x = x.floor()), + Op::TwoPowOfInt => a + .iter_mut() + .for_each(|x| *x = f32::from_bits((((*x as i32) + 127) as u32) << 23)), + } + } + xs.copy_from_slice(&a) + } + pub fn compute(&self, x: f32) -> f32 { - let mut regs = [0f32; 4]; + let mut regs = [0f32; 3]; regs[0] = x; let mut constants = self.csts.clone(); constants.insert(0, 0f32); @@ -73,287 +139,8 @@ impl Program { } } -pub mod definitions { - use super::Op::*; - use super::RegisterId::*; - use super::*; - - pub fn relu() -> Program { - Program { ops: vec![MaxConst(0)], csts: vec![] } - } - - pub fn affine(alpha: f32, beta: f32) -> Program { - Program { - #[rustfmt::skip] - ops: vec![ - MulConst(2), - AddConst(3), - ], - csts: vec![alpha, beta], - } - } - - pub fn leaky_relu(alpha: f32) -> Program { - Program { - #[rustfmt::skip] - ops: vec![ - Move(B,A), - MulConst(2), - Move(C,A), - Move(A,B), - IfPosTE, - ], - csts: vec![alpha], - } - } - - pub fn threshold_relu(alpha: f32) -> Program { - Program { - #[rustfmt::skip] - ops: vec![ - Move(B,A), - SubConst(2), - Load(C,0), - IfPosTE, - ], - csts: vec![alpha], - } - } - - pub fn softsign() -> Program { - Program { - #[rustfmt::skip] - ops: vec![ - Move(B,A), - Abs, - AddConst(1), - Recip, - Mul, - ], - csts: vec![], - } - } - - pub fn hardswish() -> Program { - Program { - #[rustfmt::skip] - ops: vec![ - Move(B, A), - MulConst(2), - AddConst(3), - MinConst(1), - MaxConst(0), - Mul, - ], - csts: vec![1f32 / 6., 0.5], - } - } - - pub fn sigmoid() -> Program { - Program { - #[rustfmt::skip] - ops: vec![ - MinConst(3), - MaxConst(2), - Move(B, A), // b = x - Move(C, A), // c = x - Mul, // a = x2 - Move(B, A), // b = x2 - MulConst(4), - AddConst(5), // a = x2 * a13 + a11 - FMA(6), - FMA(7), - FMA(8), - FMA(9), - FMA(10), - SwapBC, // c = x2, b = x - Mul, // a = p(x) - Move(B, C), // b = x2 - Move(C, A), // c = p(x) - Move(A, B), // a = x2 - MulConst(11), - AddConst(12), - FMA(13), - FMA(1), // a = q(x) - Recip, - Move(B,C), // b = p(x) - Mul, - AddConst(14) - ], - csts: vec![ - -18.6, // const 2 - 18.6, // const 3 - -4.433153405e-18, // const 4, also alpha_13 - 1.169974371e-14, // const 5, also a11 - -1.875289645e-11, - 4.257889523e-8, - 0.00004811817576, // const 8 - 0.008163842030, - 0.2499999971, // alpha_1 - 3.922935744e-6, // beta_6 - 0.001524872358, // const 12 - 0.1159886749, - 0.5, //beta_0 - ], - } - } - - pub fn exp2f() -> Program { - Program { - #[rustfmt::skip] - ops: vec![ - MinConst(2), - MaxConst(3), - Move(B, A), // b = x - AddConst(4), // a = x + 0.5 - Floor, // a = ipart - Move(C, A), // c = ipart - Move(A, B), // a = x - Move(B, C), // b = ipart - Sub, // a = fpart - Move(B, A), // b = fpart - Load(A, 5), // a = exp2p[0] - FMA(6), - FMA(7), - FMA(8), - FMA(9), - FMA(10), - FMA(1), // a = y - Move(B, A), - Move(A, C), - TwoPowOfInt, - Mul - ], - csts: vec![ - 127f32, - -127f32, - 0.5, - 1.535336188319500e-4, - 1.339887440266574e-3, - 9.618437357674640e-3, - 5.550332471162809e-2, - 2.402264791363012e-1, - 6.931472028550421e-1, - ], - } - } -} - -pub mod reference { - pub fn relu(x: f32) -> f32 { - x.max(0f32) - } - - pub fn affine(x: f32, alpha: f32, beta: f32) -> f32 { - alpha * x + beta - } - - pub fn leaky_relu(x: f32, alpha: f32) -> f32 { - if x > 0f32 { - x - } else { - alpha * x - } - } - - pub fn threshold_relu(x: f32, alpha: f32) -> f32 { - if x >= alpha { - x - } else { - 0f32 - } - } - - pub fn subsign(x: f32) -> f32 { - x / (1. + x.abs()) - } - - pub fn hardswish(x: f32) -> f32 { - x * 0f32.max(1f32.min((1. / 6.) * x + 0.5)) - } - - pub fn sigmoid(x: f32) -> f32 { - ssigmoid(x) - } - - pub fn ref_exp2f(x: f32) -> f32 { - 2f32.powf(x) - } - - pub fn cm_exp2f(x: f32) -> f32 { - exp2f(x) - } - - fn ssigmoid(x: f32) -> f32 { - const LOW: f32 = -18.6; - const HIGH: f32 = -LOW; - - const ALPHA_13: f32 = -4.433153405e-18; - const ALPHA_11: f32 = 1.169974371e-14; - const ALPHA_9: f32 = -1.875289645e-11; - const ALPHA_7: f32 = 4.257889523e-8; - const ALPHA_5: f32 = 0.00004811817576; - const ALPHA_3: f32 = 0.008163842030; - const ALPHA_1: f32 = 0.2499999971; - const BETA_6: f32 = 3.922935744e-6; - const BETA_4: f32 = 0.001524872358; - const BETA_2: f32 = 0.1159886749; - const BETA_0: f32 = 1.0; - - let x = x.clamp(LOW, HIGH); - - let x2 = x * x; - - let p = ALPHA_13; - let p = x2 * p + ALPHA_11; - let p = x2 * p + ALPHA_9; - let p = x2 * p + ALPHA_7; - let p = x2 * p + ALPHA_5; - let p = x2 * p + ALPHA_3; - let p = x2 * p + ALPHA_1; - let p = p * x; - - let q = BETA_6; - let q = x2 * q + BETA_4; - let q = x2 * q + BETA_2; - let q = x2 * q + BETA_0; - - p / q + 0.5 - } - - pub fn exp2f(x: f32) -> f32 { - const EXP2P: [f32; 7] = [ - 1.535336188319500e-4, - 1.339887440266574e-3, - 9.618437357674640e-3, - 5.550332471162809e-2, - 2.402264791363012e-1, - 6.931472028550421e-1, - 1.000000000000000, - ]; - - let x = x.min(127f32).max(-127f32); - - let ipart = (x + 0.5).floor(); - let fpart = x - ipart; - - // 2^ipart - let two_pow_ipart = f32::from_bits((((ipart as i32) + 127) as u32) << 23); - - let mut y = EXP2P[0]; - y = y * fpart + EXP2P[1]; - y = y * fpart + EXP2P[2]; - y = y * fpart + EXP2P[3]; - y = y * fpart + EXP2P[4]; - y = y * fpart + EXP2P[5]; - y = y * fpart + EXP2P[6]; - y * two_pow_ipart - } -} - #[cfg(test)] mod test { - use proptest::prelude::*; fn close_enough(a: f32, b: f32) -> bool { fn max(a: f32, b: f32) -> f32 { @@ -365,54 +152,63 @@ mod test { } let rtol = 1e-05; let atol = 1e-06; - let result = (a - b).abs() <= max(rtol * max(a.abs(), b.abs()), atol); + let result = (a.is_infinite() && b.is_infinite() && a.signum() == b.signum()) + || ((a - b).abs() <= max(rtol * max(a.abs(), b.abs()), atol)); if !result { dbg!(a, b); } - return result + return result; } - proptest! { - #[test] - fn test_relu(x in any::()) { - prop_assert_eq!(super::definitions::relu().compute(x), super::reference::relu(x)) - } + mod scalar { + use proptest::prelude::*; + use super::close_enough; - #[test] - fn test_affine(x in any::(), alpha in any::(), beta in any::()) { - prop_assert_eq!(super::definitions::affine(alpha, beta).compute(x), - super::reference::affine(x, alpha, beta)) - } - - #[test] - fn test_leaky_relu(x in any::(), alpha in any::()) { - prop_assert_eq!(super::definitions::leaky_relu(alpha).compute(x),super::reference::leaky_relu(x, alpha)) + macro_rules! prop_activation { + ($name: ident ( $($param:ident),* )) => { + proptest! { + #[test] + fn $name(x in any::(), $($param in any::()),*) { + prop_assert!(close_enough(crate::definitions::$name($($param),*).compute(x),crate::reference::$name(x, $($param),*))) + } + } + } } - #[test] - fn test_threshold_relu(x in any::(), alpha in any::()) { - prop_assert_eq!(super::definitions::threshold_relu(alpha).compute(x), super::reference::threshold_relu(x, alpha) ); + prop_activation!(relu()); + prop_activation!(affine(alpha, beta)); + prop_activation!(leaky_relu(alpha)); + prop_activation!(threshold_relu(alpha)); + prop_activation!(softsign()); + prop_activation!(hardswish()); + prop_activation!(sigmoid()); + prop_activation!(exp2f()); + } + + mod vector { + use proptest::prelude::*; + use super::close_enough; + + macro_rules! prop_activation { + ($name: ident ( $($param:ident),* )) => { + proptest! { + #[test] + fn $name(x in any::(), $($param in any::()),*) { + let mut slice = [x]; + crate::definitions::$name($($param),*).compute_slice(&mut slice); + prop_assert!(close_enough(slice[0], crate::reference::$name(x, $($param),*))) + } + } + } } - #[test] - fn test_subsign(x in any::()) { - prop_assert!(close_enough(super::definitions::softsign().compute(x), super::reference::subsign(x))); - } - - - #[test] - fn test_hardswish(x in any::()) { - prop_assert!(close_enough(super::definitions::hardswish().compute(x), super::reference::hardswish(x))); - } - - #[test] - fn test_sigmoid(x in any::()) { - prop_assert!(close_enough(super::definitions::sigmoid().compute(x), super::reference::sigmoid(x))); - } - - #[test] - fn test_cm_exp2f(x in any::()) { - prop_assert!(close_enough(super::definitions::exp2f().compute(x), super::reference::exp2f(x))); - } + prop_activation!(relu()); + prop_activation!(affine(alpha, beta)); + prop_activation!(leaky_relu(alpha)); + prop_activation!(threshold_relu(alpha)); + prop_activation!(softsign()); + prop_activation!(hardswish()); + prop_activation!(sigmoid()); + prop_activation!(exp2f()); } } diff --git a/linalg/activations/src/reference.rs b/linalg/activations/src/reference.rs new file mode 100644 index 0000000000..525fd849f0 --- /dev/null +++ b/linalg/activations/src/reference.rs @@ -0,0 +1,110 @@ + +pub fn relu(x: f32) -> f32 { + x.max(0f32) +} + +pub fn affine(x: f32, alpha: f32, beta: f32) -> f32 { + alpha * x + beta +} + +pub fn leaky_relu(x: f32, alpha: f32) -> f32 { + if x > 0f32 { + x + } else { + alpha * x + } +} + +pub fn threshold_relu(x: f32, alpha: f32) -> f32 { + if x >= alpha { + x + } else { + 0f32 + } +} + +pub fn softsign(x: f32) -> f32 { + x / (1. + x.abs()) +} + +pub fn hardswish(x: f32) -> f32 { + x * 0f32.max(1f32.min((1. / 6.) * x + 0.5)) +} + +pub fn sigmoid(x: f32) -> f32 { + ssigmoid(x) +} + +pub fn ref_exp2f(x: f32) -> f32 { + 2f32.powf(x) +} + +pub fn cm_exp2f(x: f32) -> f32 { + exp2f(x) +} + +fn ssigmoid(x: f32) -> f32 { + const LOW: f32 = -18.6; + const HIGH: f32 = -LOW; + + const ALPHA_13: f32 = -4.433153405e-18; + const ALPHA_11: f32 = 1.169974371e-14; + const ALPHA_9: f32 = -1.875289645e-11; + const ALPHA_7: f32 = 4.257889523e-8; + const ALPHA_5: f32 = 0.00004811817576; + const ALPHA_3: f32 = 0.008163842030; + const ALPHA_1: f32 = 0.2499999971; + const BETA_6: f32 = 3.922935744e-6; + const BETA_4: f32 = 0.001524872358; + const BETA_2: f32 = 0.1159886749; + const BETA_0: f32 = 1.0; + + let x = x.clamp(LOW, HIGH); + + let x2 = x * x; + + let p = ALPHA_13; + let p = x2 * p + ALPHA_11; + let p = x2 * p + ALPHA_9; + let p = x2 * p + ALPHA_7; + let p = x2 * p + ALPHA_5; + let p = x2 * p + ALPHA_3; + let p = x2 * p + ALPHA_1; + let p = p * x; + + let q = BETA_6; + let q = x2 * q + BETA_4; + let q = x2 * q + BETA_2; + let q = x2 * q + BETA_0; + + p / q + 0.5 +} + +pub fn exp2f(x: f32) -> f32 { + const EXP2P: [f32; 7] = [ + 1.535336188319500e-4, + 1.339887440266574e-3, + 9.618437357674640e-3, + 5.550332471162809e-2, + 2.402264791363012e-1, + 6.931472028550421e-1, + 1.000000000000000, + ]; + + let x = x.min(127f32).max(-127f32); + + let ipart = (x + 0.5).floor(); + let fpart = x - ipart; + + // 2^ipart + let two_pow_ipart = f32::from_bits((((ipart as i32) + 127) as u32) << 23); + + let mut y = EXP2P[0]; + y = y * fpart + EXP2P[1]; + y = y * fpart + EXP2P[2]; + y = y * fpart + EXP2P[3]; + y = y * fpart + EXP2P[4]; + y = y * fpart + EXP2P[5]; + y = y * fpart + EXP2P[6]; + y * two_pow_ipart +} From e563e65a42cf393eb7ed324bb0cb130620a23789 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 10 Apr 2023 21:30:53 +0200 Subject: [PATCH 07/25] split ew helper from ew --- linalg/src/frame.rs | 1 + linalg/src/frame/element_wise.rs | 81 +++--------------------- linalg/src/frame/element_wise_helper.rs | 83 +++++++++++++++++++++++++ linalg/src/lib.rs | 3 + 4 files changed, 95 insertions(+), 73 deletions(-) create mode 100644 linalg/src/frame/element_wise_helper.rs diff --git a/linalg/src/frame.rs b/linalg/src/frame.rs index d59d1fde54..d4c9dbc14f 100644 --- a/linalg/src/frame.rs +++ b/linalg/src/frame.rs @@ -9,6 +9,7 @@ pub mod pack; pub mod sigmoid; #[macro_use] pub mod tanh; +pub mod element_wise_helper; pub use pack::Packer; pub use pack::PackingWriter; diff --git a/linalg/src/frame/element_wise.rs b/linalg/src/frame/element_wise.rs index 11a5a865d1..bb8b7a6d0e 100644 --- a/linalg/src/frame/element_wise.rs +++ b/linalg/src/frame/element_wise.rs @@ -1,10 +1,12 @@ -use std::alloc::*; use std::fmt::Debug; use std::marker::PhantomData; -use tract_data::anyhow; + +use tract_data::TractResult; use crate::LADatum; +use super::element_wise_helper::run_over_slice_with_alignment; + macro_rules! ew_impl { ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr) => { paste! { @@ -44,53 +46,11 @@ macro_rules! ew_impl { }; } -struct TempBuffer { - layout: Layout, - buffer: *mut u8, -} - -impl Default for TempBuffer { - fn default() -> Self { - TempBuffer { layout: Layout::new::<()>(), buffer: std::ptr::null_mut() } - } -} - -impl TempBuffer { - fn ensure(&mut self, size: usize, alignment: usize) { - unsafe { - if size > self.layout.size() || alignment > self.layout.align() { - let size = size.max(self.layout.size()); - let alignment = alignment.max(self.layout.align()); - if !self.buffer.is_null() { - std::alloc::dealloc(self.buffer, self.layout); - } - self.layout = Layout::from_size_align_unchecked(size, alignment); - self.buffer = std::alloc::alloc(self.layout); - assert!(!self.buffer.is_null()); - } - } - } -} - -impl Drop for TempBuffer { - fn drop(&mut self) { - unsafe { - if !self.buffer.is_null() { - std::alloc::dealloc(self.buffer, self.layout); - } - } - } -} - -std::thread_local! { - static TMP: std::cell::RefCell = std::cell::RefCell::new(TempBuffer::default()); -} - pub trait ElementWise: Send + Sync + Debug + dyn_clone::DynClone where T: Copy + Debug + PartialEq + Send + Sync, { - fn run(&self, vec: &mut [T]) -> anyhow::Result<()>; + fn run(&self, vec: &mut [T]) -> TractResult<()>; } dyn_clone::clone_trait_object!( ElementWise where T: Copy); @@ -109,37 +69,12 @@ where T: LADatum, K: ElementWiseKer + Clone, { - fn run(&self, vec: &mut [T]) -> anyhow::Result<()> { - if vec.is_empty() { - return Ok(()); - } - unsafe { - TMP.with(|buffer| { - let mut buffer = buffer.borrow_mut(); - buffer.ensure(K::nr() * T::datum_type().size_of(), K::alignment_bytes()); - let tmp = std::slice::from_raw_parts_mut(buffer.buffer as *mut T, K::nr()); - let mut compute_via_temp_buffer = |slice: &mut [T]| { - tmp[..slice.len()].copy_from_slice(slice); - K::run(tmp); - slice.copy_from_slice(&tmp[..slice.len()]) - }; - let prefix_len = vec.as_ptr().align_offset(K::alignment_bytes()).min(vec.len()); - if prefix_len > 0 { - compute_via_temp_buffer(&mut vec[..prefix_len]); - } - let aligned_len = (vec.len() - prefix_len) / K::nr() * K::nr(); - if aligned_len > 0 { - K::run(&mut vec[prefix_len..][..aligned_len]); - } - if prefix_len + aligned_len < vec.len() { - compute_via_temp_buffer(&mut vec[prefix_len + aligned_len..]); - } - }) - } - Ok(()) + fn run(&self, vec: &mut [T]) -> TractResult<()> { + run_over_slice_with_alignment(vec, K::run, K::nr(), K::alignment_bytes()) } } + pub trait ElementWiseKer: Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static where T: LADatum, diff --git a/linalg/src/frame/element_wise_helper.rs b/linalg/src/frame/element_wise_helper.rs new file mode 100644 index 0000000000..3116217b11 --- /dev/null +++ b/linalg/src/frame/element_wise_helper.rs @@ -0,0 +1,83 @@ +use crate::LADatum; +use std::alloc::*; +use tract_data::TractResult; + +pub(crate) fn run_over_slice_with_alignment( + vec: &mut [T], + f: impl Fn(&mut [T]), + nr: usize, + alignment_bytes: usize, +) -> TractResult<()> +where + T: LADatum, +{ + if vec.is_empty() { + return Ok(()); + } + unsafe { + TMP.with(|buffer| { + let mut buffer = buffer.borrow_mut(); + buffer.ensure(nr * T::datum_type().size_of(), alignment_bytes); + let tmp = std::slice::from_raw_parts_mut(buffer.buffer as *mut T, nr); + let mut compute_via_temp_buffer = |slice: &mut [T]| { + tmp[..slice.len()].copy_from_slice(slice); + f(tmp); + slice.copy_from_slice(&tmp[..slice.len()]) + }; + let prefix_len = vec.as_ptr().align_offset(alignment_bytes).min(vec.len()); + if prefix_len > 0 { + compute_via_temp_buffer(&mut vec[..prefix_len]); + } + let aligned_len = (vec.len() - prefix_len) / nr * nr; + if aligned_len > 0 { + f(&mut vec[prefix_len..][..aligned_len]); + } + if prefix_len + aligned_len < vec.len() { + compute_via_temp_buffer(&mut vec[prefix_len + aligned_len..]); + } + }) + } + Ok(()) +} + +std::thread_local! { + static TMP: std::cell::RefCell = std::cell::RefCell::new(TempBuffer::default()); +} + +pub struct TempBuffer { + pub layout: Layout, + pub buffer: *mut u8, +} + +impl Default for TempBuffer { + fn default() -> Self { + TempBuffer { layout: Layout::new::<()>(), buffer: std::ptr::null_mut() } + } +} + +impl TempBuffer { + pub fn ensure(&mut self, size: usize, alignment: usize) { + unsafe { + if size > self.layout.size() || alignment > self.layout.align() { + let size = size.max(self.layout.size()); + let alignment = alignment.max(self.layout.align()); + if !self.buffer.is_null() { + std::alloc::dealloc(self.buffer, self.layout); + } + self.layout = Layout::from_size_align_unchecked(size, alignment); + self.buffer = std::alloc::alloc(self.layout); + assert!(!self.buffer.is_null()); + } + } + } +} + +impl Drop for TempBuffer { + fn drop(&mut self) { + unsafe { + if !self.buffer.is_null() { + std::alloc::dealloc(self.buffer, self.layout); + } + } + } +} diff --git a/linalg/src/lib.rs b/linalg/src/lib.rs index 6697bc4523..ea66629e8e 100644 --- a/linalg/src/lib.rs +++ b/linalg/src/lib.rs @@ -113,6 +113,9 @@ pub fn generic() -> Ops { tanh_f32: Box::new(|| generic::STanh4::ew()), erf_f32: Box::new(|| generic::SErf4::ew()), lut_u8: Box::new(|table: &[u8]| Box::new(lut::LutImpl::::new(table))), + /* + activation_f32: Box::new(|microcode| generic::SActivation::new(microcode)) + */ } } From 9ce7000e73fcad55d233a4a406847620922bde3e Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 10 Apr 2023 23:30:00 +0200 Subject: [PATCH 08/25] moving everything inside tract --- linalg/activations/src/lib.rs | 4 +- linalg/src/frame.rs | 2 + linalg/src/frame/activations.rs | 143 ++++++++++++++++ linalg/src/frame/activations/definitions.rs | 175 ++++++++++++++++++++ linalg/src/frame/activations/reference.rs | 110 ++++++++++++ linalg/src/frame/activations/tests.rs | 35 ++++ linalg/src/generic.rs | 1 + linalg/src/generic/activations.rs | 98 +++++++++++ linalg/src/lib.rs | 3 +- 9 files changed, 568 insertions(+), 3 deletions(-) create mode 100644 linalg/src/frame/activations.rs create mode 100644 linalg/src/frame/activations/definitions.rs create mode 100644 linalg/src/frame/activations/reference.rs create mode 100644 linalg/src/frame/activations/tests.rs create mode 100644 linalg/src/generic/activations.rs diff --git a/linalg/activations/src/lib.rs b/linalg/activations/src/lib.rs index f0a55d4638..dbb0f5c44a 100644 --- a/linalg/activations/src/lib.rs +++ b/linalg/activations/src/lib.rs @@ -161,8 +161,8 @@ mod test { } mod scalar { - use proptest::prelude::*; use super::close_enough; + use proptest::prelude::*; macro_rules! prop_activation { ($name: ident ( $($param:ident),* )) => { @@ -186,8 +186,8 @@ mod test { } mod vector { - use proptest::prelude::*; use super::close_enough; + use proptest::prelude::*; macro_rules! prop_activation { ($name: ident ( $($param:ident),* )) => { diff --git a/linalg/src/frame.rs b/linalg/src/frame.rs index d4c9dbc14f..28f6761da8 100644 --- a/linalg/src/frame.rs +++ b/linalg/src/frame.rs @@ -1,4 +1,6 @@ #[macro_use] +pub mod activations; +#[macro_use] pub mod element_wise; #[macro_use] pub mod lut; diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs new file mode 100644 index 0000000000..8b604b4998 --- /dev/null +++ b/linalg/src/frame/activations.rs @@ -0,0 +1,143 @@ +use std::fmt::Debug; +use std::marker::PhantomData; + +use tract_data::TractResult; + +use crate::LADatum; + +use super::element_wise_helper::run_over_slice_with_alignment; + +pub mod definitions; +pub mod reference; +#[macro_use] +pub mod tests; + +#[derive(Copy, Clone, Debug, PartialEq)] +#[repr(u8)] +pub enum RegisterId { + A = 0, + B = 1, + C = 2, +} + +type ConstantId = u8; + +#[repr(C, u16)] +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Op { + Done, + Move(RegisterId, RegisterId), + Load(RegisterId, ConstantId), + Abs, + Recip, + Add, + Sub, + Mul, + Min, + Max, + AddConst(ConstantId), + SubConst(ConstantId), + MulConst(ConstantId), + MinConst(ConstantId), + MaxConst(ConstantId), + FMA(ConstantId), // a <- a * b + cst + IfPosTE, + SwapBC, + Floor, + TwoPowOfInt, +} + +#[derive(Clone, Debug, PartialEq)] +pub struct Program { + pub ops: Vec, + pub csts: Vec, +} + +pub trait Activation: Send + Sync + Debug + dyn_clone::DynClone { + fn run(&self, prog: &Program, vec: &mut [T]) -> TractResult<()>; +} + +#[derive(Debug, Clone, new)] +pub struct ActivationImpl +where + T: LADatum, + K: ActivationKer + Clone, +{ + phantom: PhantomData<(K, T)>, +} + +impl Activation for ActivationImpl +where + T: LADatum, + K: ActivationKer + Clone, +{ + fn run(&self, program: &Program, vec: &mut [T]) -> TractResult<()> { + run_over_slice_with_alignment( + vec, + |slice| K::run(&program.ops, &*program.csts, slice), + K::nr(), + K::alignment_bytes(), + ) + } +} + +pub trait ActivationKer: Send + Sync + Debug + dyn_clone::DynClone + Clone + 'static +where + T: LADatum, +{ + fn name() -> &'static str; + fn alignment_bytes() -> usize; + fn alignment_items() -> usize; + fn nr() -> usize; + fn run(ops: &[Op], csts: &[T], vec: &mut [T]); + fn act() -> Box> { + Box::new(ActivationImpl::::new()) + } +} + +macro_rules! act_impl { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr) => { + paste! { + mod [] { + #[allow(unused_imports)] + use tract_data::prelude::f16; + extern_kernel!(fn $func(ptr: *mut $ti, count: usize) -> ()); + } + + #[derive(Copy, Clone, Debug)] + #[allow(non_camel_case_types)] + pub struct $func; + + impl ActivationKer<$ti> for $func { + #[inline(always)] + fn name() -> &'static str { + stringify!($func) + } + #[inline(always)] + fn nr() -> usize { + $nr + } + #[inline(always)] + fn alignment_items() -> usize { + $alignment_items + } + #[inline(always)] + fn alignment_bytes() -> usize { + $alignment_items * std::mem::size_of::<$ti>() + } + #[inline(never)] + fn run(ops: &Op, csts:&[T], buf: &mut [$ti]) { + unsafe { []::$func(ops.as_ptr(), csts.as_ptr(), buf.as_mut_ptr(), buf.len()) } + } + } + } + }; +} + +#[cfg(test)] +mod test { + #[test] + fn size_of_op() { + assert_eq!(std::mem::size_of::(), 4); + } +} diff --git a/linalg/src/frame/activations/definitions.rs b/linalg/src/frame/activations/definitions.rs new file mode 100644 index 0000000000..e1f0ff8e59 --- /dev/null +++ b/linalg/src/frame/activations/definitions.rs @@ -0,0 +1,175 @@ +use super::Op::*; +use super::RegisterId::*; +use super::*; + +pub fn relu() -> Program { + Program { ops: vec![MaxConst(0), Done], csts: vec![T::zero()] } +} + +pub fn affine(alpha: T, beta: T) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MulConst(0), + AddConst(1), + Done, + ], + csts: vec![alpha, beta], + } +} + +pub fn leaky_relu(alpha: T) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B,A), + MulConst(0), + Move(C,A), + Move(A,B), + IfPosTE, + Done, + ], + csts: vec![alpha], + } +} + +pub fn threshold_relu(alpha: T) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B,A), + SubConst(1), + Load(C,0), + IfPosTE, + Done, + ], + csts: vec![T::zero(), alpha], + } +} + +pub fn softsign() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B,A), + Abs, + AddConst(0), + Recip, + Mul, + Done, + ], + csts: vec![T::one()], + } +} + +pub fn hardswish() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + Move(B, A), + MulConst(2), + AddConst(3), + MinConst(1), + MaxConst(0), + Mul, + Done, + ], + csts: vec![ + T::zero(), + T::one(), + T::one() / (T::one() + T::one() + T::one() + T::one() + T::one() + T::one()), // 1/6 + T::one() / (T::one() + T::one()), // 1/2 + ], + } +} + +/* +pub fn sigmoid() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MinConst(3), + MaxConst(2), + Move(B, A), // b = x + Move(C, A), // c = x + Mul, // a = x2 + Move(B, A), // b = x2 + MulConst(4), + AddConst(5), // a = x2 * a13 + a11 + FMA(6), + FMA(7), + FMA(8), + FMA(9), + FMA(10), + SwapBC, // c = x2, b = x + Mul, // a = p(x) + Move(B, C), // b = x2 + Move(C, A), // c = p(x) + Move(A, B), // a = x2 + MulConst(11), + AddConst(12), + FMA(13), + FMA(1), // a = q(x) + Recip, + Move(B,C), // b = p(x) + Mul, + AddConst(14) + ], + csts: vec![ + -18.6, // const 2 + 18.6, // const 3 + -4.433153405e-18, // const 4, also alpha_13 + 1.169974371e-14, // const 5, also a11 + -1.875289645e-11, + 4.257889523e-8, + 0.00004811817576, // const 8 + 0.008163842030, + 0.2499999971, // alpha_1 + 3.922935744e-6, // beta_6 + 0.001524872358, // const 12 + 0.1159886749, + 0.5, //beta_0 + ], + } +} + +pub fn exp2f() -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MinConst(2), + MaxConst(3), + Move(B, A), // b = x + AddConst(4), // a = x + 0.5 + Floor, // a = ipart + Move(C, A), // c = ipart + Move(A, B), // a = x + Move(B, C), // b = ipart + Sub, // a = fpart + Move(B, A), // b = fpart + Load(A, 5), // a = exp2p[0] + FMA(6), + FMA(7), + FMA(8), + FMA(9), + FMA(10), + FMA(1), // a = y + Move(B, A), + Move(A, C), + TwoPowOfInt, + Mul + ], + csts: vec![ + 127f32, + -127f32, + 0.5, + 1.535336188319500e-4, + 1.339887440266574e-3, + 9.618437357674640e-3, + 5.550332471162809e-2, + 2.402264791363012e-1, + 6.931472028550421e-1, + ], + } +} +*/ diff --git a/linalg/src/frame/activations/reference.rs b/linalg/src/frame/activations/reference.rs new file mode 100644 index 0000000000..525fd849f0 --- /dev/null +++ b/linalg/src/frame/activations/reference.rs @@ -0,0 +1,110 @@ + +pub fn relu(x: f32) -> f32 { + x.max(0f32) +} + +pub fn affine(x: f32, alpha: f32, beta: f32) -> f32 { + alpha * x + beta +} + +pub fn leaky_relu(x: f32, alpha: f32) -> f32 { + if x > 0f32 { + x + } else { + alpha * x + } +} + +pub fn threshold_relu(x: f32, alpha: f32) -> f32 { + if x >= alpha { + x + } else { + 0f32 + } +} + +pub fn softsign(x: f32) -> f32 { + x / (1. + x.abs()) +} + +pub fn hardswish(x: f32) -> f32 { + x * 0f32.max(1f32.min((1. / 6.) * x + 0.5)) +} + +pub fn sigmoid(x: f32) -> f32 { + ssigmoid(x) +} + +pub fn ref_exp2f(x: f32) -> f32 { + 2f32.powf(x) +} + +pub fn cm_exp2f(x: f32) -> f32 { + exp2f(x) +} + +fn ssigmoid(x: f32) -> f32 { + const LOW: f32 = -18.6; + const HIGH: f32 = -LOW; + + const ALPHA_13: f32 = -4.433153405e-18; + const ALPHA_11: f32 = 1.169974371e-14; + const ALPHA_9: f32 = -1.875289645e-11; + const ALPHA_7: f32 = 4.257889523e-8; + const ALPHA_5: f32 = 0.00004811817576; + const ALPHA_3: f32 = 0.008163842030; + const ALPHA_1: f32 = 0.2499999971; + const BETA_6: f32 = 3.922935744e-6; + const BETA_4: f32 = 0.001524872358; + const BETA_2: f32 = 0.1159886749; + const BETA_0: f32 = 1.0; + + let x = x.clamp(LOW, HIGH); + + let x2 = x * x; + + let p = ALPHA_13; + let p = x2 * p + ALPHA_11; + let p = x2 * p + ALPHA_9; + let p = x2 * p + ALPHA_7; + let p = x2 * p + ALPHA_5; + let p = x2 * p + ALPHA_3; + let p = x2 * p + ALPHA_1; + let p = p * x; + + let q = BETA_6; + let q = x2 * q + BETA_4; + let q = x2 * q + BETA_2; + let q = x2 * q + BETA_0; + + p / q + 0.5 +} + +pub fn exp2f(x: f32) -> f32 { + const EXP2P: [f32; 7] = [ + 1.535336188319500e-4, + 1.339887440266574e-3, + 9.618437357674640e-3, + 5.550332471162809e-2, + 2.402264791363012e-1, + 6.931472028550421e-1, + 1.000000000000000, + ]; + + let x = x.min(127f32).max(-127f32); + + let ipart = (x + 0.5).floor(); + let fpart = x - ipart; + + // 2^ipart + let two_pow_ipart = f32::from_bits((((ipart as i32) + 127) as u32) << 23); + + let mut y = EXP2P[0]; + y = y * fpart + EXP2P[1]; + y = y * fpart + EXP2P[2]; + y = y * fpart + EXP2P[3]; + y = y * fpart + EXP2P[4]; + y = y * fpart + EXP2P[5]; + y = y * fpart + EXP2P[6]; + y * two_pow_ipart +} diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs new file mode 100644 index 0000000000..197608ff2e --- /dev/null +++ b/linalg/src/frame/activations/tests.rs @@ -0,0 +1,35 @@ +macro_rules! prop_activation { + ($cond:expr, $ti: ty, $ker: ty, $name: ident ( $($param:ident),* )) => { + proptest::proptest! { + #[test] + fn $name(x in proptest::prelude::any::<$ti>(), repeat in 1usize..4, $($param in proptest::prelude::any::<$ti>()),*) { + if $cond { + let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); + input.fill_t::<$ti>(x).unwrap(); + let prog = crate::frame::activations::definitions::$name($($param),*); + <$ker>::run(&prog.ops, &prog.csts, &mut input.as_slice_mut::<$ti>().unwrap()); + let expected = crate::frame::activations::reference::$name(x, $($param),*); + let mut output = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); + output.fill_t::<$ti>(expected).unwrap(); + output.close_enough(&input, true).unwrap(); + } + } + } + } +} + +#[macro_export] +macro_rules! act_frame_tests { + ($cond:expr, $ker:ty, $ti:ty) => { + prop_activation!($cond, $ti, $ker, relu()); + prop_activation!($cond, $ti, $ker, affine(alpha, beta)); + prop_activation!($cond, $ti, $ker, leaky_relu(alpha)); + prop_activation!($cond, $ti, $ker, threshold_relu(alpha)); + prop_activation!($cond, $ti, $ker, softsign()); + prop_activation!($cond, $ti, $ker, hardswish()); + /* + prop_activation!($cond, $ti, $ker, sigmoid()); + prop_activation!($cond, $ti, $ker, exp2f()); + */ + }; +} diff --git a/linalg/src/generic.rs b/linalg/src/generic.rs index 583ce45816..485c7d32f7 100644 --- a/linalg/src/generic.rs +++ b/linalg/src/generic.rs @@ -1,3 +1,4 @@ +pub mod activations; pub mod erf; pub mod lut; pub mod mmm; diff --git a/linalg/src/generic/activations.rs b/linalg/src/generic/activations.rs new file mode 100644 index 0000000000..c76eace069 --- /dev/null +++ b/linalg/src/generic/activations.rs @@ -0,0 +1,98 @@ +use crate::frame::activations::{ActivationKer, Op, RegisterId}; + +// TODO make the inner loop tighter +unsafe fn compute_slice(ops: *const Op, constants: *const f32, xs: *mut f32, len: usize) { + let mut a = std::slice::from_raw_parts_mut(xs, len); + let mut b = vec![0.0f32; a.len()]; + let mut c = vec![0.0f32; a.len()]; + let mut pc = ops; + loop { + match *pc { + Op::Done => break, + Op::Move(dst, src) => { + let mut regs = [&mut a, &mut *b, &mut c]; + let dst = dst as usize; + let src = src as usize; + if dst < src { + let (left, right) = regs.split_at_mut(src); + let d = &mut *left[dst]; + let s = &*right[0]; + d.copy_from_slice(s) + } else { + let (left, right) = regs.split_at_mut(dst); + let s = &*left[src]; + let d = &mut *right[0]; + d.copy_from_slice(s) + } + } + Op::Load(dst, cst) if dst == RegisterId::A => { + a.iter_mut().for_each(|x| *x = *constants.add(cst as usize)) + } + Op::Load(dst, cst) if dst == RegisterId::B => { + b.iter_mut().for_each(|x| *x = *constants.add(cst as usize)) + } + Op::Load(_dst, cst) => c.iter_mut().for_each(|x| *x = *constants.add(cst as usize)), + Op::Abs => a.iter_mut().for_each(|x| *x = x.abs()), + Op::Recip => a.iter_mut().for_each(|x| *x = x.recip()), + Op::Add => a.iter_mut().zip(&b).for_each(|(x, y)| *x += *y), + Op::Sub => a.iter_mut().zip(&b).for_each(|(x, y)| *x -= *y), + Op::Mul => a.iter_mut().zip(&b).for_each(|(x, y)| *x *= *y), + Op::Min => a.iter_mut().zip(&b).for_each(|(x, y)| *x = x.min(*y)), + Op::Max => a.iter_mut().zip(&b).for_each(|(x, y)| *x = x.max(*y)), + Op::AddConst(cst) => a.iter_mut().for_each(|x| *x += *constants.add(cst as usize)), + Op::SubConst(cst) => a.iter_mut().for_each(|x| *x -= *constants.add(cst as usize)), + Op::MulConst(cst) => a.iter_mut().for_each(|x| *x *= *constants.add(cst as usize)), + Op::MinConst(cst) => { + a.iter_mut().for_each(|x| *x = x.min(*constants.add(cst as usize))) + } + Op::MaxConst(cst) => { + a.iter_mut().for_each(|x| *x = x.max(*constants.add(cst as usize))) + } + Op::IfPosTE => a + .iter_mut() + .zip(&b) + .zip(&c) + .for_each(|((x, y), z)| *x = if *x >= 0f32 { *y } else { *z }), + Op::FMA(cst) => { + a.iter_mut().zip(&b).for_each(|(x, y)| *x = *x * *y + *constants.add(cst as usize)) + } + Op::SwapBC => b.iter_mut().zip(c.iter_mut()).for_each(|(b, c)| std::mem::swap(b, c)), + Op::Floor => a.iter_mut().for_each(|x| *x = x.floor()), + Op::TwoPowOfInt => { + a.iter_mut().for_each(|x| *x = f32::from_bits((((*x as i32) + 127) as u32) << 23)) + } + } + pc = pc.add(1); + } +} + +#[derive(Clone, Debug)] +pub struct SActivations; + +impl ActivationKer for SActivations { + fn name() -> &'static str { + "generic" + } + + fn alignment_bytes() -> usize { + 16 + } + + fn alignment_items() -> usize { + 4 + } + + fn nr() -> usize { + 4 + } + + fn run(ops: &[Op], csts: &[f32], xs: &mut [f32]) { + debug_assert!(xs.len() % Self::nr() == 0); + debug_assert!(xs.as_ptr() as usize % Self::alignment_bytes() == 0); + unsafe { compute_slice(ops.as_ptr(), csts.as_ptr(), xs.as_mut_ptr(), xs.len()) }; + } +} + +#[cfg(test)] +act_frame_tests!(true, SActivations, f32); + diff --git a/linalg/src/lib.rs b/linalg/src/lib.rs index ea66629e8e..e085baf4f1 100644 --- a/linalg/src/lib.rs +++ b/linalg/src/lib.rs @@ -156,7 +156,8 @@ pub trait LADatum: + 'static + Add + Sub - + Mul + + Mul + + Div + AddAssign + PartialOrd + Bounded From ed175598acdb1e734e4e742ff4a1febb20beae82 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 17 Apr 2023 22:04:06 +0200 Subject: [PATCH 09/25] wip impl arm64simd activ --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 29 +++++++++ linalg/src/arm64/arm64simd.rs | 3 + linalg/src/frame/activations.rs | 15 +++-- linalg/src/frame/activations/tests.rs | 61 ++++++++++++++++--- linalg/src/generic/activations.rs | 17 +++++- 5 files changed, 110 insertions(+), 15 deletions(-) create mode 100644 linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl new file mode 100644 index 0000000000..2462e373b7 --- /dev/null +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -0,0 +1,29 @@ +// vim: ft=arm + +// C tile regs: v16 to v31, (scratch) +// - x19-x29 to preserve (but x19, x28, x29 not used) +// - d8..d15 to preserve +// - v16 to v31, no need to preserve + +.text +.align 4 + +.cpu generic+fp+simd +.global {{G}}arm64simd_act_f32_32n_{{suffix}} +{{G}}arm64simd_act_f32_32n_{{suffix}}: + + stp d8, d9, [sp, #-16]! + stp d10, d11, [sp, #-16]! + stp d12, d13, [sp, #-16]! + stp d14, d15, [sp, #-16]! + + mov x0, 0 +// b .return + +.return: + ldp d14, d15, [sp], #16 + ldp d12, d13, [sp], #16 + ldp d10, d11, [sp], #16 + ldp d8, d9, [sp], #16 + + ret diff --git a/linalg/src/arm64/arm64simd.rs b/linalg/src/arm64/arm64simd.rs index ab03e9654b..8ec9d8618e 100644 --- a/linalg/src/arm64/arm64simd.rs +++ b/linalg/src/arm64/arm64simd.rs @@ -44,3 +44,6 @@ sigmoid_impl!(f32, arm64simd_sigmoid_f32_4n, 4, 4, true); tanh_impl!(f16, arm64fp16_tanh_f16_8n, 8, 8, crate::arm64::has_fp16()); #[cfg(not(feature="no_fp16"))] sigmoid_impl!(f16, arm64fp16_sigmoid_f16_8n, 8, 8, crate::arm64::has_fp16()); + +act_impl!(f32, arm64simd_act_f32_32n, 32, 4, true); + diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index 8b604b4998..5586c16973 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -96,19 +96,20 @@ where } macro_rules! act_impl { - ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr) => { + ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => { paste! { mod [] { #[allow(unused_imports)] use tract_data::prelude::f16; - extern_kernel!(fn $func(ptr: *mut $ti, count: usize) -> ()); + use crate::frame::activations::Op; + extern_kernel!(fn $func(ops: *const Op, constants: *const $ti, xs: *mut $ti, len: usize) -> usize); } #[derive(Copy, Clone, Debug)] #[allow(non_camel_case_types)] pub struct $func; - impl ActivationKer<$ti> for $func { + impl $crate::frame::activations::ActivationKer<$ti> for $func { #[inline(always)] fn name() -> &'static str { stringify!($func) @@ -126,10 +127,14 @@ macro_rules! act_impl { $alignment_items * std::mem::size_of::<$ti>() } #[inline(never)] - fn run(ops: &Op, csts:&[T], buf: &mut [$ti]) { - unsafe { []::$func(ops.as_ptr(), csts.as_ptr(), buf.as_mut_ptr(), buf.len()) } + fn run(ops: &[$crate::frame::activations::Op], csts:&[$ti], buf: &mut [$ti]) { + let err = unsafe { []::$func(ops.as_ptr(), csts.as_ptr(), buf.as_mut_ptr(), buf.len()) }; + assert_eq!(err, 0); } } + + #[cfg(test)] + act_tests!($cond, $func, $ti); } }; } diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 197608ff2e..fb916e856c 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -1,8 +1,22 @@ -macro_rules! prop_activation { +use crate::LADatum; + +use super::{Program, Op}; +use Op::*; + +pub fn noop() -> Program { + Program { ops: vec![Done], csts: vec![] } +} + +macro_rules! prop_act_e2e { ($cond:expr, $ti: ty, $ker: ty, $name: ident ( $($param:ident),* )) => { proptest::proptest! { #[test] - fn $name(x in proptest::prelude::any::<$ti>(), repeat in 1usize..4, $($param in proptest::prelude::any::<$ti>()),*) { + fn $name( + x in proptest::prelude::any::<$ti>(), + repeat in 1usize..4, + $($param in proptest::prelude::any::<$ti>()),*) + { + use crate::frame::activations::ActivationKer; if $cond { let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); input.fill_t::<$ti>(x).unwrap(); @@ -18,18 +32,47 @@ macro_rules! prop_activation { } } +macro_rules! prop_act_unit { + ($cond:expr, $ti: ty, $ker: ty, $name: ident ( $($param:ident),* ), $refer: expr) => { + proptest::proptest! { + #[test] + fn $name( + x in proptest::prelude::any::<$ti>(), + repeat in 1usize..4, + $($param in proptest::prelude::any::<$ti>()),*) + { + use crate::frame::activations::ActivationKer; + if $cond { + let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); + input.fill_t::<$ti>(x).unwrap(); + let refer2: fn($ti) -> $ti = $refer; + let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(refer2).collect(); + let prog = crate::frame::activations::tests::$name($($param),*); + <$ker>::run(&prog.ops, &prog.csts, &mut input.as_slice_mut::<$ti>().unwrap()); + + let expected = tract_data::prelude::tensor1(&expected); + expected.close_enough(&input, true).unwrap(); + } + } + } + } +} + #[macro_export] -macro_rules! act_frame_tests { +macro_rules! act_tests { ($cond:expr, $ker:ty, $ti:ty) => { - prop_activation!($cond, $ti, $ker, relu()); - prop_activation!($cond, $ti, $ker, affine(alpha, beta)); - prop_activation!($cond, $ti, $ker, leaky_relu(alpha)); - prop_activation!($cond, $ti, $ker, threshold_relu(alpha)); - prop_activation!($cond, $ti, $ker, softsign()); - prop_activation!($cond, $ti, $ker, hardswish()); + prop_act_unit!($cond, $ti, $ker, noop(), |x| x); + + prop_act_e2e!($cond, $ti, $ker, relu()); + prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); + prop_act_e2e!($cond, $ti, $ker, leaky_relu(alpha)); + prop_act_e2e!($cond, $ti, $ker, threshold_relu(alpha)); + prop_act_e2e!($cond, $ti, $ker, softsign()); + prop_act_e2e!($cond, $ti, $ker, hardswish()); /* prop_activation!($cond, $ti, $ker, sigmoid()); prop_activation!($cond, $ti, $ker, exp2f()); */ }; } + diff --git a/linalg/src/generic/activations.rs b/linalg/src/generic/activations.rs index c76eace069..48fccc8cf3 100644 --- a/linalg/src/generic/activations.rs +++ b/linalg/src/generic/activations.rs @@ -94,5 +94,20 @@ impl ActivationKer for SActivations { } #[cfg(test)] -act_frame_tests!(true, SActivations, f32); +act_tests!(true, SActivations, f32); +#[cfg(test)] +mod tests { + use crate::frame::activations::Op; + use crate::frame::activations::ActivationKer; + + use super::SActivations; + + #[test] + fn act_noop() { + let mut xs = vec!(1f32; SActivations::nr()); + let expect = xs.clone(); + SActivations::run(&[Op::Done], &[], &mut *xs); + assert_eq!(expect, xs); + } +} From 794940235733f8c680c8f915b744e2f1e302889a Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 17 Apr 2023 22:58:50 +0200 Subject: [PATCH 10/25] wip, broken max const --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 58 ++++++++++++++++++- linalg/src/frame/activations.rs | 8 ++- linalg/src/frame/activations/tests.rs | 16 +++-- 3 files changed, 73 insertions(+), 9 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 2462e373b7..3160c16a03 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -8,6 +8,10 @@ .text .align 4 +// fn(ops: *const Op, constants: *const $ti, xs: *mut $ti, len: usize) -> usize + +// x0 <- ops, x1 <- constant, x2 <- xs, x3 <- len(xs) + .cpu generic+fp+simd .global {{G}}arm64simd_act_f32_32n_{{suffix}} {{G}}arm64simd_act_f32_32n_{{suffix}}: @@ -16,9 +20,61 @@ stp d10, d11, [sp, #-16]! stp d12, d13, [sp, #-16]! stp d14, d15, [sp, #-16]! + + cmp x3, 0 + beq .ok + +.outer_loop: + mov x5, x0 // x5 is "pc" + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2] + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], 64 + +.inner_loop: + ldr x6, [x5] // x6 is fetched instruction at x5 + and x7, x6, 0xffff + + cmp x7, 0 + beq .end_of_inner_loop + cmp x7, 10 + beq .max_const + + b .unsupported + +.inner_loop_payload_done: + add x5, x5, 4 + b .inner_loop +.end_of_inner_loop: + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2] + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], 64 + + add x2, x2, 128 + subs x3, x3, 32 + bne .outer_loop + +.max_const: + lsr x7, x6, 16 + and x7, x7, 0xff + lsl x7, x7, 2 + add x7, x7, x1 + ld1 { v24.s }[0], [x7] + dup v24.4s, v24.s[0] + fmax v0.4s, v0.4s, v24.4s + fmax v1.4s, v1.4s, v24.4s + fmax v2.4s, v2.4s, v24.4s + fmax v3.4s, v3.4s, v24.4s + fmax v4.4s, v4.4s, v24.4s + fmax v5.4s, v5.4s, v24.4s + fmax v6.4s, v6.4s, v24.4s + fmax v7.4s, v7.4s, v24.4s + b .inner_loop_payload_done + + +.unsupported: + mov x0, 1 + b .return +.ok: mov x0, 0 -// b .return .return: ldp d14, d15, [sp], #16 diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index 5586c16973..580731bd83 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -133,8 +133,12 @@ macro_rules! act_impl { } } - #[cfg(test)] - act_tests!($cond, $func, $ti); + mod [] { + use super::*; + + #[cfg(test)] + act_tests!($cond, $func, $ti); + } } }; } diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index fb916e856c..22756d4dbb 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -1,12 +1,16 @@ use crate::LADatum; -use super::{Program, Op}; +use super::{Op, Program}; use Op::*; pub fn noop() -> Program { Program { ops: vec![Done], csts: vec![] } } +pub fn max_const(c: T) -> Program { + Program { ops: vec![MaxConst(0)], csts: vec![c] } +} + macro_rules! prop_act_e2e { ($cond:expr, $ti: ty, $ker: ty, $name: ident ( $($param:ident),* )) => { proptest::proptest! { @@ -14,7 +18,7 @@ macro_rules! prop_act_e2e { fn $name( x in proptest::prelude::any::<$ti>(), repeat in 1usize..4, - $($param in proptest::prelude::any::<$ti>()),*) + $($param in proptest::prelude::any::<$ti>()),*) { use crate::frame::activations::ActivationKer; if $cond { @@ -39,14 +43,14 @@ macro_rules! prop_act_unit { fn $name( x in proptest::prelude::any::<$ti>(), repeat in 1usize..4, - $($param in proptest::prelude::any::<$ti>()),*) + $($param in proptest::prelude::any::<$ti>()),*) { use crate::frame::activations::ActivationKer; if $cond { let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); input.fill_t::<$ti>(x).unwrap(); - let refer2: fn($ti) -> $ti = $refer; - let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(refer2).collect(); +// let refer2: fn($ti, $($param),*) -> $ti = $refer; + let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| $refer(x, $($param),*)).collect(); let prog = crate::frame::activations::tests::$name($($param),*); <$ker>::run(&prog.ops, &prog.csts, &mut input.as_slice_mut::<$ti>().unwrap()); @@ -62,6 +66,7 @@ macro_rules! prop_act_unit { macro_rules! act_tests { ($cond:expr, $ker:ty, $ti:ty) => { prop_act_unit!($cond, $ti, $ker, noop(), |x| x); + prop_act_unit!($cond, $ti, $ker, max_const(alpha), |x: $ti, alpha| x.max(alpha)); prop_act_e2e!($cond, $ti, $ker, relu()); prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); @@ -75,4 +80,3 @@ macro_rules! act_tests { */ }; } - From 3dc125f33dc093c1414c9ee4a546b1d308104d07 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 1 May 2023 20:17:12 +0200 Subject: [PATCH 11/25] fixes to max_const --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 24 +++++------- linalg/src/frame/activations.rs | 19 ++++++---- linalg/src/frame/activations/tests.rs | 37 +++++++++++++++++-- 3 files changed, 54 insertions(+), 26 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 3160c16a03..02ffc94b45 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -9,7 +9,6 @@ .align 4 // fn(ops: *const Op, constants: *const $ti, xs: *mut $ti, len: usize) -> usize - // x0 <- ops, x1 <- constant, x2 <- xs, x3 <- len(xs) .cpu generic+fp+simd @@ -26,33 +25,31 @@ .outer_loop: mov x5, x0 // x5 is "pc" - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2] + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 64 ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], 64 + sub x2, x2, 128 .inner_loop: - ldr x6, [x5] // x6 is fetched instruction at x5 - and x7, x6, 0xffff + ldr w6, [x5], 4 // x6 is fetched instruction at x5 + and w7, w6, 0xffff - cmp x7, 0 + cmp w7, 0 beq .end_of_inner_loop - cmp x7, 10 + cmp w7, 14 beq .max_const b .unsupported -.inner_loop_payload_done: - add x5, x5, 4 - b .inner_loop .end_of_inner_loop: - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2] + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 64 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], 64 - add x2, x2, 128 subs x3, x3, 32 bne .outer_loop + b .ok .max_const: - lsr x7, x6, 16 + lsr w7, w6, 16 and x7, x7, 0xff lsl x7, x7, 2 add x7, x7, x1 @@ -66,8 +63,7 @@ fmax v5.4s, v5.4s, v24.4s fmax v6.4s, v6.4s, v24.4s fmax v7.4s, v7.4s, v24.4s - b .inner_loop_payload_done - + b .inner_loop .unsupported: mov x0, 1 diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index 580731bd83..b8aa3aab03 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -25,21 +25,21 @@ type ConstantId = u8; #[repr(C, u16)] #[derive(Copy, Clone, Debug, PartialEq)] pub enum Op { - Done, + Done, // 0 Move(RegisterId, RegisterId), Load(RegisterId, ConstantId), - Abs, + Abs, // 3 Recip, Add, - Sub, + Sub, // 6 Mul, Min, - Max, - AddConst(ConstantId), + Max, // 9 + AddConst(ConstantId), // 10 SubConst(ConstantId), MulConst(ConstantId), MinConst(ConstantId), - MaxConst(ConstantId), + MaxConst(ConstantId), // 14 FMA(ConstantId), // a <- a * b + cst IfPosTE, SwapBC, @@ -129,7 +129,7 @@ macro_rules! act_impl { #[inline(never)] fn run(ops: &[$crate::frame::activations::Op], csts:&[$ti], buf: &mut [$ti]) { let err = unsafe { []::$func(ops.as_ptr(), csts.as_ptr(), buf.as_mut_ptr(), buf.len()) }; - assert_eq!(err, 0); + assert_eq!(err, 0, "Kernel function return non zero {}", err); } } @@ -145,8 +145,11 @@ macro_rules! act_impl { #[cfg(test)] mod test { + use super::*; + #[test] fn size_of_op() { - assert_eq!(std::mem::size_of::(), 4); + assert_eq!(std::mem::size_of::(), 4); } + } diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 22756d4dbb..1d900f8165 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -8,7 +8,7 @@ pub fn noop() -> Program { } pub fn max_const(c: T) -> Program { - Program { ops: vec![MaxConst(0)], csts: vec![c] } + Program { ops: vec![MaxConst(0), Done], csts: vec![c] } } macro_rules! prop_act_e2e { @@ -25,7 +25,7 @@ macro_rules! prop_act_e2e { let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); input.fill_t::<$ti>(x).unwrap(); let prog = crate::frame::activations::definitions::$name($($param),*); - <$ker>::run(&prog.ops, &prog.csts, &mut input.as_slice_mut::<$ti>().unwrap()); + <$ker>::run(&prog.ops, &prog.csts, input.as_slice_mut::<$ti>().unwrap()); let expected = crate::frame::activations::reference::$name(x, $($param),*); let mut output = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); output.fill_t::<$ti>(expected).unwrap(); @@ -49,10 +49,9 @@ macro_rules! prop_act_unit { if $cond { let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); input.fill_t::<$ti>(x).unwrap(); -// let refer2: fn($ti, $($param),*) -> $ti = $refer; let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| $refer(x, $($param),*)).collect(); let prog = crate::frame::activations::tests::$name($($param),*); - <$ker>::run(&prog.ops, &prog.csts, &mut input.as_slice_mut::<$ti>().unwrap()); + <$ker>::run(&prog.ops, &prog.csts, input.as_slice_mut::<$ti>().unwrap()); let expected = tract_data::prelude::tensor1(&expected); expected.close_enough(&input, true).unwrap(); @@ -68,6 +67,36 @@ macro_rules! act_tests { prop_act_unit!($cond, $ti, $ker, noop(), |x| x); prop_act_unit!($cond, $ti, $ker, max_const(alpha), |x: $ti, alpha| x.max(alpha)); + #[test] + fn max_const_0() { + use crate::frame::activations::ActivationKer; + if $cond { + let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr()], <$ker>::alignment_bytes()).unwrap(); + input.fill_t::<$ti>(0.0).unwrap(); + let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| x.max(0f32)).collect(); + let prog = crate::frame::activations::tests::max_const(0f32); + <$ker>::run(&prog.ops, &prog.csts, input.as_slice_mut::<$ti>().unwrap()); + + let expected = tract_data::prelude::tensor1(&expected); + expected.close_enough(&input, true).unwrap(); + } + } + + #[test] + fn max_const_big_alpha() { + use crate::frame::activations::ActivationKer; + if $cond { + let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr()], <$ker>::alignment_bytes()).unwrap(); + input.fill_t::<$ti>(0.0).unwrap(); + let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| x.max(7.567773e37)).collect(); + let prog = crate::frame::activations::tests::max_const(7.567773e37); + <$ker>::run(&prog.ops, &prog.csts, input.as_slice_mut::<$ti>().unwrap()); + + let expected = tract_data::prelude::tensor1(&expected); + expected.close_enough(&input, true).unwrap(); + } + } + prop_act_e2e!($cond, $ti, $ker, relu()); prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); prop_act_e2e!($cond, $ti, $ker, leaky_relu(alpha)); From dce1b860d712332cafb1e797c6e11e25ba5fe5c3 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 1 May 2023 22:18:31 +0200 Subject: [PATCH 12/25] move constant into op stream --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 22 ++- linalg/src/frame/activations.rs | 128 +++++++++++++++--- linalg/src/frame/activations/definitions.rs | 39 ++---- linalg/src/frame/activations/tests.rs | 43 ++++-- linalg/src/generic/activations.rs | 86 ++++++------ 5 files changed, 205 insertions(+), 113 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 02ffc94b45..e5d30108ea 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -8,8 +8,8 @@ .text .align 4 -// fn(ops: *const Op, constants: *const $ti, xs: *mut $ti, len: usize) -> usize -// x0 <- ops, x1 <- constant, x2 <- xs, x3 <- len(xs) +// fn(ops: *const OpOrConst, xs: *mut $ti, len: usize) -> usize +// x0 <- ops, x1 <- xs, x2 <- len(xs) .cpu generic+fp+simd .global {{G}}arm64simd_act_f32_32n_{{suffix}} @@ -25,9 +25,9 @@ .outer_loop: mov x5, x0 // x5 is "pc" - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 64 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], 64 - sub x2, x2, 128 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], 64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], 64 + sub x1, x1, 128 .inner_loop: ldr w6, [x5], 4 // x6 is fetched instruction at x5 @@ -41,19 +41,15 @@ b .unsupported .end_of_inner_loop: - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 64 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], 64 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], 64 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], 64 - subs x3, x3, 32 + subs x2, x2, 32 bne .outer_loop b .ok .max_const: - lsr w7, w6, 16 - and x7, x7, 0xff - lsl x7, x7, 2 - add x7, x7, x1 - ld1 { v24.s }[0], [x7] + ld1 { v24.s }[0], [x5], 4 dup v24.4s, v24.s[0] fmax v0.4s, v0.4s, v24.4s fmax v1.4s, v1.4s, v24.4s diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index b8aa3aab03..4afa3a7ca4 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -12,6 +12,11 @@ pub mod reference; #[macro_use] pub mod tests; +#[derive(Clone, Debug, PartialEq)] +pub struct Program { + pub ops: Vec>, +} + #[derive(Copy, Clone, Debug, PartialEq)] #[repr(u8)] pub enum RegisterId { @@ -20,37 +25,116 @@ pub enum RegisterId { C = 2, } -type ConstantId = u8; +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum Op { + Move(RegisterId, RegisterId), + Load(RegisterId, T), + Abs, // 3 + Recip, + Add, + Sub, // 6 + Mul, + Min, + Max, // 9 + AddConst(T), // 10 + SubConst(T), + MulConst(T), + MinConst(T), + MaxConst(T), // 14 + FMA(T), // a <- a * b + cst + IfPosTE, + SwapBC, + Floor, + TwoPowOfInt, +} + +impl Program { + pub fn translate(&self) -> KerProgram { + let mut ops: Vec> = vec![]; + for op in &self.ops { + match op { + Op::Move(a, b) => ops.push(OpOrConst { op: KerOp::Move(*a, *b) }), + Op::Load(a, t) => { + ops.push(OpOrConst { op: KerOp::Load(*a) }); + ops.push(OpOrConst { t: *t }); + } + Op::Abs => ops.push(OpOrConst { op: KerOp::Abs }), + Op::Recip => ops.push(OpOrConst { op: KerOp::Recip }), + Op::Add => ops.push(OpOrConst { op: KerOp::Add }), + Op::Sub => ops.push(OpOrConst { op: KerOp::Sub }), // 6 + Op::Mul => ops.push(OpOrConst { op: KerOp::Mul }), + Op::Min => ops.push(OpOrConst { op: KerOp::Min }), + Op::Max => ops.push(OpOrConst { op: KerOp::Max }), // 9 + Op::AddConst(t) => { + ops.push(OpOrConst { op: KerOp::AddConst }); + ops.push(OpOrConst { t: *t }); + } + Op::SubConst(t) => { + ops.push(OpOrConst { op: KerOp::SubConst }); + ops.push(OpOrConst { t: *t }); + } + Op::MulConst(t) => { + ops.push(OpOrConst { op: KerOp::MulConst }); + ops.push(OpOrConst { t: *t }); + } + Op::MinConst(t) => { + ops.push(OpOrConst { op: KerOp::MinConst }); + ops.push(OpOrConst { t: *t }); + } + Op::MaxConst(t) => { + ops.push(OpOrConst { op: KerOp::MaxConst }); + ops.push(OpOrConst { t: *t }); + } + Op::FMA(t) => { + ops.push(OpOrConst { op: KerOp::FMA }); + ops.push(OpOrConst { t: *t }); + } + Op::IfPosTE => ops.push(OpOrConst { op: KerOp::IfPosTE }), + Op::SwapBC => ops.push(OpOrConst { op: KerOp::SwapBC }), + Op::Floor => ops.push(OpOrConst { op: KerOp::Floor }), + Op::TwoPowOfInt => ops.push(OpOrConst { op: KerOp::TwoPowOfInt }), + } + } + ops.push(OpOrConst { op: KerOp::Done }); + KerProgram { ops } + } +} #[repr(C, u16)] #[derive(Copy, Clone, Debug, PartialEq)] -pub enum Op { +pub enum KerOp { Done, // 0 Move(RegisterId, RegisterId), - Load(RegisterId, ConstantId), + Load(RegisterId), Abs, // 3 Recip, Add, Sub, // 6 Mul, Min, - Max, // 9 - AddConst(ConstantId), // 10 - SubConst(ConstantId), - MulConst(ConstantId), - MinConst(ConstantId), - MaxConst(ConstantId), // 14 - FMA(ConstantId), // a <- a * b + cst + Max, // 9 + AddConst, // 10 + SubConst, + MulConst, + MinConst, + MaxConst, // 14 + FMA, // a <- a * b + cst IfPosTE, SwapBC, Floor, TwoPowOfInt, } -#[derive(Clone, Debug, PartialEq)] -pub struct Program { - pub ops: Vec, - pub csts: Vec, +#[derive(Clone)] +pub struct KerProgram { + pub ops: Vec>, +} + +#[repr(C)] +#[derive(Clone, Copy)] +pub union OpOrConst { + pub op: KerOp, + pub t: T, } pub trait Activation: Send + Sync + Debug + dyn_clone::DynClone { @@ -72,9 +156,10 @@ where K: ActivationKer + Clone, { fn run(&self, program: &Program, vec: &mut [T]) -> TractResult<()> { + let ker_program = program.translate(); run_over_slice_with_alignment( vec, - |slice| K::run(&program.ops, &*program.csts, slice), + |slice| K::run(&ker_program.ops, slice), K::nr(), K::alignment_bytes(), ) @@ -89,7 +174,7 @@ where fn alignment_bytes() -> usize; fn alignment_items() -> usize; fn nr() -> usize; - fn run(ops: &[Op], csts: &[T], vec: &mut [T]); + fn run(ops: &[OpOrConst], vec: &mut [T]); fn act() -> Box> { Box::new(ActivationImpl::::new()) } @@ -101,8 +186,8 @@ macro_rules! act_impl { mod [] { #[allow(unused_imports)] use tract_data::prelude::f16; - use crate::frame::activations::Op; - extern_kernel!(fn $func(ops: *const Op, constants: *const $ti, xs: *mut $ti, len: usize) -> usize); + use $crate::frame::activations::OpOrConst; + extern_kernel!(fn $func(ops: *const OpOrConst<$ti>, xs: *mut $ti, len: usize) -> usize); } #[derive(Copy, Clone, Debug)] @@ -127,8 +212,8 @@ macro_rules! act_impl { $alignment_items * std::mem::size_of::<$ti>() } #[inline(never)] - fn run(ops: &[$crate::frame::activations::Op], csts:&[$ti], buf: &mut [$ti]) { - let err = unsafe { []::$func(ops.as_ptr(), csts.as_ptr(), buf.as_mut_ptr(), buf.len()) }; + fn run(ops: &[$crate::frame::activations::OpOrConst<$ti>], buf: &mut [$ti]) { + let err = unsafe { []::$func(ops.as_ptr(), buf.as_mut_ptr(), buf.len()) }; assert_eq!(err, 0, "Kernel function return non zero {}", err); } } @@ -149,7 +234,6 @@ mod test { #[test] fn size_of_op() { - assert_eq!(std::mem::size_of::(), 4); + assert_eq!(std::mem::size_of::>(), 4); } - } diff --git a/linalg/src/frame/activations/definitions.rs b/linalg/src/frame/activations/definitions.rs index e1f0ff8e59..40dc7bff84 100644 --- a/linalg/src/frame/activations/definitions.rs +++ b/linalg/src/frame/activations/definitions.rs @@ -3,18 +3,16 @@ use super::RegisterId::*; use super::*; pub fn relu() -> Program { - Program { ops: vec![MaxConst(0), Done], csts: vec![T::zero()] } + Program { ops: vec![MaxConst(T::zero())] } } pub fn affine(alpha: T, beta: T) -> Program { Program { #[rustfmt::skip] ops: vec![ - MulConst(0), - AddConst(1), - Done, + MulConst(alpha), + AddConst(beta), ], - csts: vec![alpha, beta], } } @@ -23,13 +21,11 @@ pub fn leaky_relu(alpha: T) -> Program { #[rustfmt::skip] ops: vec![ Move(B,A), - MulConst(0), + MulConst(alpha), Move(C,A), Move(A,B), IfPosTE, - Done, ], - csts: vec![alpha], } } @@ -38,12 +34,10 @@ pub fn threshold_relu(alpha: T) -> Program { #[rustfmt::skip] ops: vec![ Move(B,A), - SubConst(1), - Load(C,0), + SubConst(alpha), + Load(C, T::zero()), IfPosTE, - Done, ], - csts: vec![T::zero(), alpha], } } @@ -53,33 +47,26 @@ pub fn softsign() -> Program { ops: vec![ Move(B,A), Abs, - AddConst(0), + AddConst(T::one()), Recip, Mul, - Done, ], - csts: vec![T::one()], } } pub fn hardswish() -> Program { + let one_sixth = T::one() / (T::one() + T::one() + T::one() + T::one() + T::one() + T::one()); + let one_half = T::one() / (T::one() + T::one()); Program { #[rustfmt::skip] ops: vec![ Move(B, A), - MulConst(2), - AddConst(3), - MinConst(1), - MaxConst(0), + MulConst(one_sixth), + AddConst(one_half), + MinConst(T::one()), + MaxConst(T::zero()), Mul, - Done, ], - csts: vec![ - T::zero(), - T::one(), - T::one() / (T::one() + T::one() + T::one() + T::one() + T::one() + T::one()), // 1/6 - T::one() / (T::one() + T::one()), // 1/2 - ], } } diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 1d900f8165..9b2d8750d8 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -4,11 +4,11 @@ use super::{Op, Program}; use Op::*; pub fn noop() -> Program { - Program { ops: vec![Done], csts: vec![] } + Program { ops: vec![] } } pub fn max_const(c: T) -> Program { - Program { ops: vec![MaxConst(0), Done], csts: vec![c] } + Program { ops: vec![MaxConst(c)] } } macro_rules! prop_act_e2e { @@ -24,8 +24,8 @@ macro_rules! prop_act_e2e { if $cond { let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); input.fill_t::<$ti>(x).unwrap(); - let prog = crate::frame::activations::definitions::$name($($param),*); - <$ker>::run(&prog.ops, &prog.csts, input.as_slice_mut::<$ti>().unwrap()); + let prog = crate::frame::activations::definitions::$name($($param),*).translate(); + <$ker>::run(&prog.ops, input.as_slice_mut::<$ti>().unwrap()); let expected = crate::frame::activations::reference::$name(x, $($param),*); let mut output = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); output.fill_t::<$ti>(expected).unwrap(); @@ -50,8 +50,8 @@ macro_rules! prop_act_unit { let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); input.fill_t::<$ti>(x).unwrap(); let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| $refer(x, $($param),*)).collect(); - let prog = crate::frame::activations::tests::$name($($param),*); - <$ker>::run(&prog.ops, &prog.csts, input.as_slice_mut::<$ti>().unwrap()); + let prog = crate::frame::activations::tests::$name($($param),*).translate(); + <$ker>::run(&prog.ops, input.as_slice_mut::<$ti>().unwrap()); let expected = tract_data::prelude::tensor1(&expected); expected.close_enough(&input, true).unwrap(); @@ -71,11 +71,16 @@ macro_rules! act_tests { fn max_const_0() { use crate::frame::activations::ActivationKer; if $cond { - let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr()], <$ker>::alignment_bytes()).unwrap(); + let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>( + &[<$ker>::nr()], + <$ker>::alignment_bytes(), + ) + .unwrap(); input.fill_t::<$ti>(0.0).unwrap(); - let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| x.max(0f32)).collect(); - let prog = crate::frame::activations::tests::max_const(0f32); - <$ker>::run(&prog.ops, &prog.csts, input.as_slice_mut::<$ti>().unwrap()); + let expected: Vec<$ti> = + input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| x.max(0f32)).collect(); + let prog = crate::frame::activations::tests::max_const(0f32).translate(); + <$ker>::run(&prog.ops, input.as_slice_mut::<$ti>().unwrap()); let expected = tract_data::prelude::tensor1(&expected); expected.close_enough(&input, true).unwrap(); @@ -86,11 +91,21 @@ macro_rules! act_tests { fn max_const_big_alpha() { use crate::frame::activations::ActivationKer; if $cond { - let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr()], <$ker>::alignment_bytes()).unwrap(); + let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>( + &[<$ker>::nr()], + <$ker>::alignment_bytes(), + ) + .unwrap(); input.fill_t::<$ti>(0.0).unwrap(); - let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| x.max(7.567773e37)).collect(); - let prog = crate::frame::activations::tests::max_const(7.567773e37); - <$ker>::run(&prog.ops, &prog.csts, input.as_slice_mut::<$ti>().unwrap()); + let expected: Vec<$ti> = input + .as_slice::<$ti>() + .unwrap() + .iter() + .cloned() + .map(|x| x.max(7.567773e37)) + .collect(); + let prog = crate::frame::activations::tests::max_const(7.567773e37).translate(); + <$ker>::run(&prog.ops, input.as_slice_mut::<$ti>().unwrap()); let expected = tract_data::prelude::tensor1(&expected); expected.close_enough(&input, true).unwrap(); diff --git a/linalg/src/generic/activations.rs b/linalg/src/generic/activations.rs index 48fccc8cf3..0870e4714e 100644 --- a/linalg/src/generic/activations.rs +++ b/linalg/src/generic/activations.rs @@ -1,15 +1,16 @@ -use crate::frame::activations::{ActivationKer, Op, RegisterId}; +use crate::frame::activations::{ActivationKer, KerOp, OpOrConst, RegisterId}; // TODO make the inner loop tighter -unsafe fn compute_slice(ops: *const Op, constants: *const f32, xs: *mut f32, len: usize) { +unsafe fn compute_slice(ops: *const OpOrConst, xs: *mut f32, len: usize) { let mut a = std::slice::from_raw_parts_mut(xs, len); let mut b = vec![0.0f32; a.len()]; let mut c = vec![0.0f32; a.len()]; let mut pc = ops; loop { - match *pc { - Op::Done => break, - Op::Move(dst, src) => { + let op = (*pc).op; + match op { + KerOp::Done => break, + KerOp::Move(dst, src) => { let mut regs = [&mut a, &mut *b, &mut c]; let dst = dst as usize; let src = src as usize; @@ -25,40 +26,48 @@ unsafe fn compute_slice(ops: *const Op, constants: *const f32, xs: *mut f32, len d.copy_from_slice(s) } } - Op::Load(dst, cst) if dst == RegisterId::A => { - a.iter_mut().for_each(|x| *x = *constants.add(cst as usize)) - } - Op::Load(dst, cst) if dst == RegisterId::B => { - b.iter_mut().for_each(|x| *x = *constants.add(cst as usize)) - } - Op::Load(_dst, cst) => c.iter_mut().for_each(|x| *x = *constants.add(cst as usize)), - Op::Abs => a.iter_mut().for_each(|x| *x = x.abs()), - Op::Recip => a.iter_mut().for_each(|x| *x = x.recip()), - Op::Add => a.iter_mut().zip(&b).for_each(|(x, y)| *x += *y), - Op::Sub => a.iter_mut().zip(&b).for_each(|(x, y)| *x -= *y), - Op::Mul => a.iter_mut().zip(&b).for_each(|(x, y)| *x *= *y), - Op::Min => a.iter_mut().zip(&b).for_each(|(x, y)| *x = x.min(*y)), - Op::Max => a.iter_mut().zip(&b).for_each(|(x, y)| *x = x.max(*y)), - Op::AddConst(cst) => a.iter_mut().for_each(|x| *x += *constants.add(cst as usize)), - Op::SubConst(cst) => a.iter_mut().for_each(|x| *x -= *constants.add(cst as usize)), - Op::MulConst(cst) => a.iter_mut().for_each(|x| *x *= *constants.add(cst as usize)), - Op::MinConst(cst) => { - a.iter_mut().for_each(|x| *x = x.min(*constants.add(cst as usize))) + KerOp::Load(dst) => { + pc = pc.add(1); + let t = (*pc).t; + match dst { + RegisterId::A => a.iter_mut().for_each(|x| *x = t), + RegisterId::B => b.iter_mut().for_each(|x| *x = t), + RegisterId::C => c.iter_mut().for_each(|x| *x = t), + } } - Op::MaxConst(cst) => { - a.iter_mut().for_each(|x| *x = x.max(*constants.add(cst as usize))) + KerOp::Abs => a.iter_mut().for_each(|x| *x = x.abs()), + KerOp::Recip => a.iter_mut().for_each(|x| *x = x.recip()), + KerOp::Add => a.iter_mut().zip(&b).for_each(|(x, y)| *x += *y), + KerOp::Sub => a.iter_mut().zip(&b).for_each(|(x, y)| *x -= *y), + KerOp::Mul => a.iter_mut().zip(&b).for_each(|(x, y)| *x *= *y), + KerOp::Min => a.iter_mut().zip(&b).for_each(|(x, y)| *x = x.min(*y)), + KerOp::Max => a.iter_mut().zip(&b).for_each(|(x, y)| *x = x.max(*y)), + KerOp::AddConst + | KerOp::SubConst + | KerOp::MulConst + | KerOp::MinConst + | KerOp::MaxConst + | KerOp::FMA => { + pc = pc.add(1); + let t = (*pc).t; + match op { + KerOp::AddConst => a.iter_mut().for_each(|x| *x += t), + KerOp::SubConst => a.iter_mut().for_each(|x| *x -= t), + KerOp::MulConst => a.iter_mut().for_each(|x| *x *= t), + KerOp::MinConst => a.iter_mut().for_each(|x| *x = x.min(t)), + KerOp::MaxConst => a.iter_mut().for_each(|x| *x = x.max(t)), + KerOp::FMA => a.iter_mut().zip(&b).for_each(|(x, y)| *x = *x * *y + t), + _ => unreachable!(), + } } - Op::IfPosTE => a + KerOp::IfPosTE => a .iter_mut() .zip(&b) .zip(&c) .for_each(|((x, y), z)| *x = if *x >= 0f32 { *y } else { *z }), - Op::FMA(cst) => { - a.iter_mut().zip(&b).for_each(|(x, y)| *x = *x * *y + *constants.add(cst as usize)) - } - Op::SwapBC => b.iter_mut().zip(c.iter_mut()).for_each(|(b, c)| std::mem::swap(b, c)), - Op::Floor => a.iter_mut().for_each(|x| *x = x.floor()), - Op::TwoPowOfInt => { + KerOp::SwapBC => b.iter_mut().zip(c.iter_mut()).for_each(|(b, c)| std::mem::swap(b, c)), + KerOp::Floor => a.iter_mut().for_each(|x| *x = x.floor()), + KerOp::TwoPowOfInt => { a.iter_mut().for_each(|x| *x = f32::from_bits((((*x as i32) + 127) as u32) << 23)) } } @@ -86,10 +95,10 @@ impl ActivationKer for SActivations { 4 } - fn run(ops: &[Op], csts: &[f32], xs: &mut [f32]) { + fn run(ops: &[OpOrConst], xs: &mut [f32]) { debug_assert!(xs.len() % Self::nr() == 0); debug_assert!(xs.as_ptr() as usize % Self::alignment_bytes() == 0); - unsafe { compute_slice(ops.as_ptr(), csts.as_ptr(), xs.as_mut_ptr(), xs.len()) }; + unsafe { compute_slice(ops.as_ptr(), xs.as_mut_ptr(), xs.len()) }; } } @@ -98,16 +107,17 @@ act_tests!(true, SActivations, f32); #[cfg(test)] mod tests { - use crate::frame::activations::Op; use crate::frame::activations::ActivationKer; + use crate::frame::activations::KerOp; + use crate::frame::activations::OpOrConst; use super::SActivations; #[test] fn act_noop() { - let mut xs = vec!(1f32; SActivations::nr()); + let mut xs = vec![1f32; SActivations::nr()]; let expect = xs.clone(); - SActivations::run(&[Op::Done], &[], &mut *xs); + SActivations::run(&[OpOrConst { op: KerOp::Done }], &mut *xs); assert_eq!(expect, xs); } } From 00623543cacbb4343f2a83effe6abab089912581 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 8 May 2023 20:18:27 +0200 Subject: [PATCH 13/25] read two instruction slots in the main loop --- linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl | 8 +++++--- linalg/src/frame/activations.rs | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index e5d30108ea..0f454584d2 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -20,7 +20,7 @@ stp d12, d13, [sp, #-16]! stp d14, d15, [sp, #-16]! - cmp x3, 0 + cmp x2, 0 beq .ok .outer_loop: @@ -30,7 +30,8 @@ sub x1, x1, 128 .inner_loop: - ldr w6, [x5], 4 // x6 is fetched instruction at x5 + ldp w6, w3, [x5], 4 // x6 is fetched instruction at x5, w3 is next or const + // read 8, but only move by 4, as the second word could be a const and w7, w6, 0xffff cmp w7, 0 @@ -49,7 +50,8 @@ b .ok .max_const: - ld1 { v24.s }[0], [x5], 4 + ins v24.s[0], w3 + add x5, x5, 4 dup v24.4s, v24.s[0] fmax v0.4s, v0.4s, v24.4s fmax v1.4s, v1.4s, v24.4s diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index 4afa3a7ca4..2bb737c50c 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -96,6 +96,7 @@ impl Program { } } ops.push(OpOrConst { op: KerOp::Done }); + ops.push(OpOrConst { op: KerOp::Done }); // add a second one to help with pair load KerProgram { ops } } } From 0145ec735f75a264111c1d923f335f6b134ca5f0 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 8 May 2023 20:51:11 +0200 Subject: [PATCH 14/25] automagic jump table generation --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 53 ++++++++++++++++--- linalg/build.rs | 9 ++-- linalg/src/frame/activations.rs | 42 ++++++++------- 3 files changed, 73 insertions(+), 31 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 0f454584d2..15b1ea7ff4 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -33,15 +33,16 @@ ldp w6, w3, [x5], 4 // x6 is fetched instruction at x5, w3 is next or const // read 8, but only move by 4, as the second word could be a const and w7, w6, 0xffff + adr x4, .jmp_table + add x4, x4, x7, LSL#2 + br x4 - cmp w7, 0 - beq .end_of_inner_loop - cmp w7, 14 - beq .max_const +.jmp_table: +{% for j in jump_table_act %} + b .{{j}} +{% endfor %} - b .unsupported - -.end_of_inner_loop: +.done: st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], 64 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], 64 @@ -49,6 +50,33 @@ bne .outer_loop b .ok +.move: + b .unsupported +.load: + b .unsupported +.abs: + b .unsupported +.recip: + b .unsupported +.add: + b .unsupported +.sub: + b .unsupported +.mul: + b .unsupported +.min: + b .unsupported +.max: + b .unsupported +.add_const: + b .unsupported +.sub_const: + b .unsupported +.mul_const: + b .unsupported +.min_const: + b .unsupported + .max_const: ins v24.s[0], w3 add x5, x5, 4 @@ -63,6 +91,17 @@ fmax v7.4s, v7.4s, v24.4s b .inner_loop +.fma: + b .unsupported +.if_pos_then_else: + b .unsupported +.swap_b_c: + b .unsupported +.floor: + b .unsupported +.two_pow_of_int: + b .unsupported + .unsupported: mov x0, 1 b .return diff --git a/linalg/build.rs b/linalg/build.rs index 55c526c381..76b9a6bd2c 100644 --- a/linalg/build.rs +++ b/linalg/build.rs @@ -17,9 +17,9 @@ fn use_masm() -> bool { env::var("CARGO_CFG_TARGET_ENV") == Ok("msvc".to_string()) && var("HOST").contains("-windows-") } -fn jump_table() -> Vec { - println!("cargo:rerun-if-changed=src/frame/mmm/fuse.rs"); - std::fs::read_to_string("src/frame/mmm/fuse.rs") +fn jump_table(rust_file: &str) -> Vec { + println!("cargo:rerun-if-changed={rust_file}"); + std::fs::read_to_string(rust_file) .unwrap() .lines() .filter(|l| l.contains("// jump_to:")) @@ -274,7 +274,8 @@ fn preprocess_file( "G": g, "suffix": suffix, "long": long, - "jump_table": jump_table(), + "jump_table": jump_table("src/frame/mmm/fuse.rs"), + "jump_table_act": jump_table("src/frame/activations.rs"), "align": align, }); for (k, v) in variants { diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index 2bb737c50c..f25fa3bf02 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -103,27 +103,29 @@ impl Program { #[repr(C, u16)] #[derive(Copy, Clone, Debug, PartialEq)] +#[rustfmt::skip] pub enum KerOp { - Done, // 0 - Move(RegisterId, RegisterId), - Load(RegisterId), - Abs, // 3 - Recip, - Add, - Sub, // 6 - Mul, - Min, - Max, // 9 - AddConst, // 10 - SubConst, - MulConst, - MinConst, - MaxConst, // 14 - FMA, // a <- a * b + cst - IfPosTE, - SwapBC, - Floor, - TwoPowOfInt, + Done, // jump_to:done + Move(RegisterId, RegisterId), // jump_to:move + Load(RegisterId), // jump_to:load + Abs, // jump_to:abs + Recip, // jump_to:recip + Add, // jump_to:add + Sub, // jump_to:sub + Mul, // jump_to:mul + Min, // jump_to:min + Max, // jump_to:max + AddConst, // jump_to:add_const + SubConst, // jump_to:sub_const + MulConst, // jump_to:mul_const + MinConst, // jump_to:min_const + MaxConst, // jump_to:max_const + // a <- a * b + cst + FMA, // jump_to:fma + IfPosTE, // jump_to:if_pos_then_else + SwapBC, // jump_to:swap_b_c + Floor, // jump_to:floor + TwoPowOfInt, // jump_to:two_pow_of_int } #[derive(Clone)] From de1459a9c6ed7050c63d765c68a25b4a8ab48c4a Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 8 May 2023 21:59:35 +0200 Subject: [PATCH 15/25] better test expression --- linalg/src/frame/activations.rs | 6 +- linalg/src/frame/activations/tests.rs | 188 ++++++++++++-------------- linalg/src/generic/activations.rs | 6 +- 3 files changed, 93 insertions(+), 107 deletions(-) diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index f25fa3bf02..965b8a8890 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -183,6 +183,7 @@ where } } +#[allow(unused_macros)] macro_rules! act_impl { ($ti: ident, $func: ident, $nr: expr, $alignment_items: expr, $cond: expr) => { paste! { @@ -221,10 +222,9 @@ macro_rules! act_impl { } } + #[cfg(test)] mod [] { - use super::*; - - #[cfg(test)] + pub use super::*; act_tests!($cond, $func, $ti); } } diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 9b2d8750d8..1e91ab414a 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -1,6 +1,6 @@ use crate::LADatum; -use super::{Op, Program}; +use super::{ActivationKer, Op, Program}; use Op::*; pub fn noop() -> Program { @@ -11,116 +11,104 @@ pub fn max_const(c: T) -> Program { Program { ops: vec![MaxConst(c)] } } -macro_rules! prop_act_e2e { - ($cond:expr, $ti: ty, $ker: ty, $name: ident ( $($param:ident),* )) => { - proptest::proptest! { - #[test] - fn $name( - x in proptest::prelude::any::<$ti>(), - repeat in 1usize..4, - $($param in proptest::prelude::any::<$ti>()),*) - { - use crate::frame::activations::ActivationKer; - if $cond { - let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); - input.fill_t::<$ti>(x).unwrap(); - let prog = crate::frame::activations::definitions::$name($($param),*).translate(); - <$ker>::run(&prog.ops, input.as_slice_mut::<$ti>().unwrap()); - let expected = crate::frame::activations::reference::$name(x, $($param),*); - let mut output = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); - output.fill_t::<$ti>(expected).unwrap(); - output.close_enough(&input, true).unwrap(); - } - } - } - } -} - -macro_rules! prop_act_unit { - ($cond:expr, $ti: ty, $ker: ty, $name: ident ( $($param:ident),* ), $refer: expr) => { - proptest::proptest! { - #[test] - fn $name( - x in proptest::prelude::any::<$ti>(), - repeat in 1usize..4, - $($param in proptest::prelude::any::<$ti>()),*) - { - use crate::frame::activations::ActivationKer; - if $cond { - let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>(&[<$ker>::nr() * repeat], <$ker>::alignment_bytes()).unwrap(); - input.fill_t::<$ti>(x).unwrap(); - let expected:Vec<$ti> = input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| $refer(x, $($param),*)).collect(); - let prog = crate::frame::activations::tests::$name($($param),*).translate(); - <$ker>::run(&prog.ops, input.as_slice_mut::<$ti>().unwrap()); - - let expected = tract_data::prelude::tensor1(&expected); - expected.close_enough(&input, true).unwrap(); - } - } - } - } +pub fn run_kernel_test>( + input: &[TI], + prog: &[Op], + refer: impl Fn(TI) -> TI, +) { + let mut tensor = + tract_data::prelude::Tensor::zero_aligned::(&[input.len()], K::alignment_bytes()) + .unwrap(); + tensor.as_slice_mut::().unwrap().copy_from_slice(input); + let expected: Vec = input.iter().cloned().map(|x| refer(x)).collect(); + let expected = tract_data::prelude::tensor1(&expected); + let prog = Program { ops: prog.to_vec() }; + let prog = prog.translate(); + K::run(&prog.ops, tensor.as_slice_mut::().unwrap()); + expected.close_enough(&tensor, true).unwrap(); } #[macro_export] macro_rules! act_tests { ($cond:expr, $ker:ty, $ti:ty) => { - prop_act_unit!($cond, $ti, $ker, noop(), |x| x); - prop_act_unit!($cond, $ti, $ker, max_const(alpha), |x: $ti, alpha| x.max(alpha)); + mod acttest { + #[allow(unused_imports)] + use super::*; + use $crate::frame::activations::ActivationKer; + use $crate::frame::activations::tests::*; + use $crate::frame::activations::Op::*; + use num_traits::Zero; + use proptest::prelude::*; + use proptest::collection::vec; - #[test] - fn max_const_0() { - use crate::frame::activations::ActivationKer; - if $cond { - let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>( - &[<$ker>::nr()], - <$ker>::alignment_bytes(), - ) - .unwrap(); - input.fill_t::<$ti>(0.0).unwrap(); - let expected: Vec<$ti> = - input.as_slice::<$ti>().unwrap().iter().cloned().map(|x| x.max(0f32)).collect(); - let prog = crate::frame::activations::tests::max_const(0f32).translate(); - <$ker>::run(&prog.ops, input.as_slice_mut::<$ti>().unwrap()); + fn x_strat() -> impl Strategy> { + (1usize..4).prop_flat_map(|repeat| { + let size = <$ker>::nr() * repeat; + vec(any::<$ti>(), size..size+1) + }) + } - let expected = tract_data::prelude::tensor1(&expected); - expected.close_enough(&input, true).unwrap(); + proptest::proptest! { + #[test] + fn noop(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[], |x| x); + } + } + + #[test] + fn max_const_prop(alpha in any::<$ti>(), x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[MaxConst(alpha)], |x| x.max(alpha)); + } + } + } + + #[test] + fn max_const_zero() { + if $cond { + run_kernel_test::<$ti, $ker>( + &vec!(<$ti>::zero(); <$ker>::nr()), + &[MaxConst(<$ti>::zero())], + |x| x.max(<$ti>::zero()) + ); + } } - } - #[test] - fn max_const_big_alpha() { - use crate::frame::activations::ActivationKer; - if $cond { - let mut input = tract_data::prelude::Tensor::zero_aligned::<$ti>( - &[<$ker>::nr()], - <$ker>::alignment_bytes(), - ) - .unwrap(); - input.fill_t::<$ti>(0.0).unwrap(); - let expected: Vec<$ti> = input - .as_slice::<$ti>() - .unwrap() - .iter() - .cloned() - .map(|x| x.max(7.567773e37)) - .collect(); - let prog = crate::frame::activations::tests::max_const(7.567773e37).translate(); - <$ker>::run(&prog.ops, input.as_slice_mut::<$ti>().unwrap()); + #[test] + fn max_const_big_alpha() { + if $cond { + run_kernel_test::<$ti, $ker>( + &vec!(<$ti>::zero(); <$ker>::nr()), + &[MaxConst(7.567773e37.into())], + |x| x.max(7.567773e37.into()) + ); + } + } - let expected = tract_data::prelude::tensor1(&expected); - expected.close_enough(&input, true).unwrap(); + proptest::proptest! { + #[test] + fn relu_prop(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::relu().ops, + |x| x.max(<$ti>::zero()) + ); + } + } } + /* + prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); + prop_act_e2e!($cond, $ti, $ker, leaky_relu(alpha)); + prop_act_e2e!($cond, $ti, $ker, threshold_relu(alpha)); + prop_act_e2e!($cond, $ti, $ker, softsign()); + prop_act_e2e!($cond, $ti, $ker, hardswish()); + /* + prop_activation!($cond, $ti, $ker, sigmoid()); + prop_activation!($cond, $ti, $ker, exp2f()); + */ + */ } - - prop_act_e2e!($cond, $ti, $ker, relu()); - prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); - prop_act_e2e!($cond, $ti, $ker, leaky_relu(alpha)); - prop_act_e2e!($cond, $ti, $ker, threshold_relu(alpha)); - prop_act_e2e!($cond, $ti, $ker, softsign()); - prop_act_e2e!($cond, $ti, $ker, hardswish()); - /* - prop_activation!($cond, $ti, $ker, sigmoid()); - prop_activation!($cond, $ti, $ker, exp2f()); - */ }; } diff --git a/linalg/src/generic/activations.rs b/linalg/src/generic/activations.rs index 0870e4714e..1cbb8c02ee 100644 --- a/linalg/src/generic/activations.rs +++ b/linalg/src/generic/activations.rs @@ -102,17 +102,15 @@ impl ActivationKer for SActivations { } } -#[cfg(test)] -act_tests!(true, SActivations, f32); - #[cfg(test)] mod tests { use crate::frame::activations::ActivationKer; use crate::frame::activations::KerOp; use crate::frame::activations::OpOrConst; - use super::SActivations; + act_tests!(true, crate::generic::activations::SActivations, f32); + #[test] fn act_noop() { let mut xs = vec![1f32; SActivations::nr()]; From 865a1bcd2987c56a5ee669e02dd386ad101e132f Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 8 May 2023 22:11:28 +0200 Subject: [PATCH 16/25] affine, add_const, mul_const --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 26 +++++++++++++++++-- linalg/src/frame/activations/tests.rs | 25 ++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 15b1ea7ff4..3b85f31668 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -69,11 +69,33 @@ .max: b .unsupported .add_const: - b .unsupported + ins v24.s[0], w3 + add x5, x5, 4 + dup v24.4s, v24.s[0] + fadd v0.4s, v0.4s, v24.4s + fadd v1.4s, v1.4s, v24.4s + fadd v2.4s, v2.4s, v24.4s + fadd v3.4s, v3.4s, v24.4s + fadd v4.4s, v4.4s, v24.4s + fadd v5.4s, v5.4s, v24.4s + fadd v6.4s, v6.4s, v24.4s + fadd v7.4s, v7.4s, v24.4s + b .inner_loop .sub_const: b .unsupported .mul_const: - b .unsupported + ins v24.s[0], w3 + add x5, x5, 4 + dup v24.4s, v24.s[0] + fmul v0.4s, v0.4s, v24.4s + fmul v1.4s, v1.4s, v24.4s + fmul v2.4s, v2.4s, v24.4s + fmul v3.4s, v3.4s, v24.4s + fmul v4.4s, v4.4s, v24.4s + fmul v5.4s, v5.4s, v24.4s + fmul v6.4s, v6.4s, v24.4s + fmul v7.4s, v7.4s, v24.4s + b .inner_loop .min_const: b .unsupported diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 1e91ab414a..795f4837b5 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -56,6 +56,20 @@ macro_rules! act_tests { } } + #[test] + fn add_const_prop(alpha in any::<$ti>(), x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[AddConst(alpha)], |x| x + alpha); + } + } + + #[test] + fn mul_const_prop(alpha in any::<$ti>(), x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[MulConst(alpha)], |x| x * alpha); + } + } + #[test] fn max_const_prop(alpha in any::<$ti>(), x in x_strat()) { if $cond { @@ -97,6 +111,17 @@ macro_rules! act_tests { ); } } + + #[test] + fn affine_prop(x in x_strat(), alpha in any::<$ti>(), beta in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::affine(alpha, beta).ops, + |x| x * alpha + beta + ); + } + } } /* prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); From c4dc790f036dc5f9fab019f7f3e830c9047d93ef Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 8 May 2023 23:14:34 +0200 Subject: [PATCH 17/25] broken wip --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 136 +++++++++++++++++- linalg/src/frame/activations.rs | 1 + linalg/src/frame/activations/definitions.rs | 14 +- linalg/src/frame/activations/tests.rs | 68 ++++++++- 4 files changed, 214 insertions(+), 5 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 3b85f31668..2309f89450 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -51,9 +51,141 @@ b .ok .move: - b .unsupported + lsr w7, w6, 16 + and w7, w7, 0xff // w7 is dst reg + lsr w6, w6, 24 + and w6, w6, 0xff // w6 is src + add w7, w7, w6, LSL#2 // 4bits DDSS + adr x4, .move_jmp_table + add x4, x4, x7, LSL#2 + br x4 + +.move_jmp_table: + b .inner_loop // a to a + b .move_a_b + b .move_a_c + b .unsupported // a <- d + b .move_b_a + b .inner_loop // b <- b + b .move_b_c + b .unsupported // b <- d + b .move_c_a + b .move_c_b + b .inner_loop // c <- c + b .unsupported // c <- d + b .unsupported // a <- d + b .unsupported // b <- d + b .unsupported // c <- d + b .unsupported // d <- d + +.move_a_b: + and v0.16b, v8.16b, v8.16b + and v1.16b, v9.16b, v9.16b + and v2.16b, v10.16b, v10.16b + and v3.16b, v11.16b, v11.16b + and v4.16b, v12.16b, v12.16b + and v5.16b, v13.16b, v13.16b + and v6.16b, v14.16b, v14.16b + and v7.16b, v15.16b, v15.16b + b .inner_loop + +.move_a_c: + and v0.16b, v16.16b, v16.16b + and v1.16b, v17.16b, v17.16b + and v2.16b, v18.16b, v18.16b + and v3.16b, v19.16b, v19.16b + and v4.16b, v20.16b, v20.16b + and v5.16b, v21.16b, v21.16b + and v6.16b, v22.16b, v22.16b + and v7.16b, v23.16b, v23.16b + b .inner_loop + +.move_b_a: + and v8.16b , v0.16b, v0.16b + and v9.16b , v1.16b, v1.16b + and v10.16b, v2.16b, v2.16b + and v11.16b, v3.16b, v3.16b + and v12.16b, v4.16b, v4.16b + and v13.16b, v5.16b, v5.16b + and v14.16b, v6.16b, v6.16b + and v15.16b, v7.16b, v7.16b + b .inner_loop + +.move_b_c: + and v8.16b , v16.16b, v16.16b + and v9.16b , v17.16b, v17.16b + and v10.16b, v18.16b, v18.16b + and v11.16b, v19.16b, v19.16b + and v12.16b, v20.16b, v20.16b + and v13.16b, v21.16b, v21.16b + and v14.16b, v22.16b, v22.16b + and v15.16b, v23.16b, v23.16b + b .inner_loop + +.move_c_a: + and v16.16b, v0.16b, v0.16b + and v17.16b, v1.16b, v1.16b + and v18.16b, v2.16b, v2.16b + and v19.16b, v3.16b, v3.16b + and v20.16b, v4.16b, v4.16b + and v21.16b, v5.16b, v5.16b + and v22.16b, v6.16b, v6.16b + and v23.16b, v7.16b, v7.16b + b .inner_loop + +.move_c_b: + and v16.16b, v8.16b , v8.16b + and v17.16b, v9.16b , v9.16b + and v18.16b, v10.16b, v10.16b + and v19.16b, v11.16b, v11.16b + and v20.16b, v12.16b, v12.16b + and v21.16b, v13.16b, v13.16b + and v22.16b, v14.16b, v14.16b + and v23.16b, v15.16b, v15.16b + b .inner_loop + .load: - b .unsupported + add x5, x5, 4 + ins v24.s[0], w3 + lsr w7, w6, 16 + and w7, w7, 0xff + adr x4, .load_jmp_table + add x4, x4, x7, LSL#2 + br x4 +.load_jmp_table: + b .load_a + b .load_b + b .load_c +.load_a: + dup v0.4s, v24.s[0] + dup v1.4s, v24.s[0] + dup v2.4s, v24.s[0] + dup v3.4s, v24.s[0] + dup v4.4s, v24.s[0] + dup v5.4s, v24.s[0] + dup v6.4s, v24.s[0] + dup v7.4s, v24.s[0] + b .inner_loop +.load_b: + dup v8.4s, v24.s[0] + dup v9.4s, v24.s[0] + dup v10.4s, v24.s[0] + dup v11.4s, v24.s[0] + dup v12.4s, v24.s[0] + dup v13.4s, v24.s[0] + dup v14.4s, v24.s[0] + dup v15.4s, v24.s[0] + b .inner_loop +.load_c: + dup v16.4s, v24.s[0] + dup v17.4s, v24.s[0] + dup v18.4s, v24.s[0] + dup v19.4s, v24.s[0] + dup v20.4s, v24.s[0] + dup v21.4s, v24.s[0] + dup v22.4s, v24.s[0] + dup v23.4s, v24.s[0] + b .inner_loop .abs: b .unsupported .recip: diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index 965b8a8890..586f0e29c7 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -10,6 +10,7 @@ use super::element_wise_helper::run_over_slice_with_alignment; pub mod definitions; pub mod reference; #[macro_use] +#[cfg(test)] pub mod tests; #[derive(Clone, Debug, PartialEq)] diff --git a/linalg/src/frame/activations/definitions.rs b/linalg/src/frame/activations/definitions.rs index 40dc7bff84..918c48e705 100644 --- a/linalg/src/frame/activations/definitions.rs +++ b/linalg/src/frame/activations/definitions.rs @@ -41,6 +41,18 @@ pub fn threshold_relu(alpha: T) -> Program { } } +pub fn hard_sigmoid(alpha: T, beta: T) -> Program { + Program { + #[rustfmt::skip] + ops: vec![ + MulConst(alpha), + AddConst(beta), + MinConst(T::one()), + MaxConst(T::zero()), + ], + } +} + pub fn softsign() -> Program { Program { #[rustfmt::skip] @@ -54,7 +66,7 @@ pub fn softsign() -> Program { } } -pub fn hardswish() -> Program { +pub fn hard_swish() -> Program { let one_sixth = T::one() / (T::one() + T::one() + T::one() + T::one() + T::one() + T::one()); let one_half = T::one() / (T::one() + T::one()); Program { diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 795f4837b5..80c720b901 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -1,7 +1,8 @@ use crate::LADatum; -use super::{ActivationKer, Op, Program}; +use super::{ActivationKer, Op, Program, RegisterId}; use Op::*; +use proptest::prelude::*; pub fn noop() -> Program { Program { ops: vec![] } @@ -28,6 +29,14 @@ pub fn run_kernel_test>( expected.close_enough(&tensor, true).unwrap(); } +impl Arbitrary for RegisterId { + type Parameters = (); + type Strategy = BoxedStrategy; + fn arbitrary_with(_: Self::Parameters) -> Self::Strategy { + proptest::prop_oneof![Just(RegisterId::A), Just(RegisterId::B), Just(RegisterId::C)].boxed() + } +} + #[macro_export] macro_rules! act_tests { ($cond:expr, $ker:ty, $ti:ty) => { @@ -37,7 +46,8 @@ macro_rules! act_tests { use $crate::frame::activations::ActivationKer; use $crate::frame::activations::tests::*; use $crate::frame::activations::Op::*; - use num_traits::Zero; + use $crate::frame::activations::RegisterId; + use num_traits::{Zero, One}; use proptest::prelude::*; use proptest::collection::vec; @@ -56,6 +66,38 @@ macro_rules! act_tests { } } + #[test] + fn load_a_prop(x in x_strat(), konst in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Load(RegisterId::A, konst)], |_| konst); + } + } + + #[test] + fn load_b_prop(x in x_strat(), konst in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Load(RegisterId::B, konst)], |x| x); + } + } + + #[test] + fn load_c_prop(x in x_strat(), konst in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Load(RegisterId::C, konst)], |x| x); + } + } + + #[test] + fn move_b_to_a_prop(x in x_strat(), konst in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &[Load(RegisterId::B, konst), Move(RegisterId::A, RegisterId::B)], + |_| konst + ); + } + } + #[test] fn add_const_prop(alpha in any::<$ti>(), x in x_strat()) { if $cond { @@ -122,6 +164,28 @@ macro_rules! act_tests { ); } } + + #[test] + fn hard_sigmoid(x in x_strat(), alpha in any::<$ti>(), beta in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::hard_sigmoid(alpha, beta).ops, + |x| (x * alpha + beta).min(<$ti>::one()).max(<$ti>::zero()) + ); + } + } + + #[test] + fn hard_swish(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::hard_swish().ops, + |x| (x * 1./6. + 0.5).min(<$ti>::one()).max(<$ti>::zero()) * x + ); + } + } } /* prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); From 2e5d696123e0b1f4da284db4285f32cca088803e Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 15 May 2023 21:12:55 +0200 Subject: [PATCH 18/25] hard sigmoid ok --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 16 +++++++- linalg/src/frame/activations/tests.rs | 38 ++++++++++++++----- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 2309f89450..07097f93e1 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -55,7 +55,7 @@ and w7, w7, 0xff // w7 is dst reg lsr w6, w6, 24 and w6, w6, 0xff // w6 is src - add w7, w7, w6, LSL#2 // 4bits DDSS + add w7, w6, w7, LSL#2 // 4bits DDSS adr x4, .move_jmp_table add x4, x4, x7, LSL#2 br x4 @@ -228,8 +228,20 @@ fmul v6.4s, v6.4s, v24.4s fmul v7.4s, v7.4s, v24.4s b .inner_loop + .min_const: - b .unsupported + ins v24.s[0], w3 + add x5, x5, 4 + dup v24.4s, v24.s[0] + fmin v0.4s, v0.4s, v24.4s + fmin v1.4s, v1.4s, v24.4s + fmin v2.4s, v2.4s, v24.4s + fmin v3.4s, v3.4s, v24.4s + fmin v4.4s, v4.4s, v24.4s + fmin v5.4s, v5.4s, v24.4s + fmin v6.4s, v6.4s, v24.4s + fmin v7.4s, v7.4s, v24.4s + b .inner_loop .max_const: ins v24.s[0], w3 diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 80c720b901..1f60643459 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -1,8 +1,8 @@ use crate::LADatum; use super::{ActivationKer, Op, Program, RegisterId}; -use Op::*; use proptest::prelude::*; +use Op::*; pub fn noop() -> Program { Program { ops: vec![] } @@ -43,18 +43,18 @@ macro_rules! act_tests { mod acttest { #[allow(unused_imports)] use super::*; - use $crate::frame::activations::ActivationKer; + use num_traits::{One, Zero}; + use proptest::collection::vec; + use proptest::prelude::*; use $crate::frame::activations::tests::*; + use $crate::frame::activations::ActivationKer; use $crate::frame::activations::Op::*; use $crate::frame::activations::RegisterId; - use num_traits::{Zero, One}; - use proptest::prelude::*; - use proptest::collection::vec; fn x_strat() -> impl Strategy> { (1usize..4).prop_flat_map(|repeat| { let size = <$ker>::nr() * repeat; - vec(any::<$ti>(), size..size+1) + vec(any::<$ti>(), size..size + 1) }) } @@ -118,15 +118,22 @@ macro_rules! act_tests { run_kernel_test::<$ti, $ker>(&x, &[MaxConst(alpha)], |x| x.max(alpha)); } } + + #[test] + fn min_const_prop(alpha in any::<$ti>(), x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[MinConst(alpha)], |x| x.min(alpha)); + } + } } #[test] fn max_const_zero() { if $cond { run_kernel_test::<$ti, $ker>( - &vec!(<$ti>::zero(); <$ker>::nr()), + &vec![<$ti>::zero(); <$ker>::nr()], &[MaxConst(<$ti>::zero())], - |x| x.max(<$ti>::zero()) + |x| x.max(<$ti>::zero()), ); } } @@ -135,9 +142,20 @@ macro_rules! act_tests { fn max_const_big_alpha() { if $cond { run_kernel_test::<$ti, $ker>( - &vec!(<$ti>::zero(); <$ker>::nr()), + &vec![<$ti>::zero(); <$ker>::nr()], &[MaxConst(7.567773e37.into())], - |x| x.max(7.567773e37.into()) + |x| x.max(7.567773e37.into()), + ); + } + } + + #[test] + fn move_b_to_a_0() { + if $cond { + run_kernel_test::<$ti, $ker>( + &*vec![<$ti>::zero(); <$ker>::nr()], + &[Load(RegisterId::B, 1.0 as _), Move(RegisterId::A, RegisterId::B)], + |_| 1.0 as _, ); } } From 244bd95b7cadf7cdee20415362e9d2b1a71488d9 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 15 May 2023 21:47:02 +0200 Subject: [PATCH 19/25] hardswish ok --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 28 ++- linalg/src/frame/activations/tests.rs | 169 ++++++++++-------- 2 files changed, 124 insertions(+), 73 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 07097f93e1..0955d07173 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -195,7 +195,15 @@ .sub: b .unsupported .mul: - b .unsupported + fmul v0.4s, v0.4s, v8.4s + fmul v1.4s, v1.4s, v9.4s + fmul v2.4s, v2.4s, v10.4s + fmul v3.4s, v3.4s, v11.4s + fmul v4.4s, v4.4s, v12.4s + fmul v5.4s, v5.4s, v13.4s + fmul v6.4s, v6.4s, v14.4s + fmul v7.4s, v7.4s, v15.4s + b .inner_loop .min: b .unsupported .max: @@ -260,7 +268,23 @@ .fma: b .unsupported .if_pos_then_else: - b .unsupported + fcmge v0.4s, v0.4s, #0.0 + fcmge v1.4s, v1.4s, #0.0 + fcmge v2.4s, v2.4s, #0.0 + fcmge v3.4s, v3.4s, #0.0 + fcmge v4.4s, v4.4s, #0.0 + fcmge v5.4s, v5.4s, #0.0 + fcmge v6.4s, v6.4s, #0.0 + fcmge v7.4s, v7.4s, #0.0 + bsl v0.16b, v8.16b, v16.16b + bsl v1.16b, v9.16b, v17.16b + bsl v2.16b, v10.16b, v18.16b + bsl v3.16b, v11.16b, v19.16b + bsl v4.16b, v12.16b, v20.16b + bsl v5.16b, v13.16b, v21.16b + bsl v6.16b, v14.16b, v22.16b + bsl v7.16b, v15.16b, v23.16b + b .inner_loop .swap_b_c: b .unsupported .floor: diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 1f60643459..4515581d1c 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -16,10 +16,10 @@ pub fn run_kernel_test>( input: &[TI], prog: &[Op], refer: impl Fn(TI) -> TI, -) { + ) { let mut tensor = tract_data::prelude::Tensor::zero_aligned::(&[input.len()], K::alignment_bytes()) - .unwrap(); + .unwrap(); tensor.as_slice_mut::().unwrap().copy_from_slice(input); let expected: Vec = input.iter().cloned().map(|x| refer(x)).collect(); let expected = tract_data::prelude::tensor1(&expected); @@ -94,7 +94,7 @@ macro_rules! act_tests { &x, &[Load(RegisterId::B, konst), Move(RegisterId::A, RegisterId::B)], |_| konst - ); + ); } } @@ -125,97 +125,124 @@ macro_rules! act_tests { run_kernel_test::<$ti, $ker>(&x, &[MinConst(alpha)], |x| x.min(alpha)); } } - } - - #[test] - fn max_const_zero() { - if $cond { - run_kernel_test::<$ti, $ker>( - &vec![<$ti>::zero(); <$ker>::nr()], - &[MaxConst(<$ti>::zero())], - |x| x.max(<$ti>::zero()), - ); - } - } - #[test] - fn max_const_big_alpha() { - if $cond { - run_kernel_test::<$ti, $ker>( - &vec![<$ti>::zero(); <$ker>::nr()], - &[MaxConst(7.567773e37.into())], - |x| x.max(7.567773e37.into()), - ); - } - } - - #[test] - fn move_b_to_a_0() { - if $cond { - run_kernel_test::<$ti, $ker>( - &*vec![<$ti>::zero(); <$ker>::nr()], - &[Load(RegisterId::B, 1.0 as _), Move(RegisterId::A, RegisterId::B)], - |_| 1.0 as _, - ); + #[test] + fn mul_prop(x in x_strat(), v in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Load(RegisterId::B, v), Mul], |x| x * v); + } } - } - proptest::proptest! { #[test] - fn relu_prop(x in x_strat()) { + fn ifposte_prop(x in x_strat()) { if $cond { - run_kernel_test::<$ti, $ker>( - &x, - &$crate::frame::activations::definitions::relu().ops, - |x| x.max(<$ti>::zero()) - ); + run_kernel_test::<$ti, $ker>(&x, + &[Load(RegisterId::B, 2 as _), Load(RegisterId::C, 3 as _), IfPosTE], + |x| if x >= <$ti>::zero() { 2 as _ } else { 3 as _ }); + } } } #[test] - fn affine_prop(x in x_strat(), alpha in any::<$ti>(), beta in any::<$ti>()) { + fn max_const_zero() { if $cond { run_kernel_test::<$ti, $ker>( - &x, - &$crate::frame::activations::definitions::affine(alpha, beta).ops, - |x| x * alpha + beta - ); + &vec![<$ti>::zero(); <$ker>::nr()], + &[MaxConst(<$ti>::zero())], + |x| x.max(<$ti>::zero()), + ); } } #[test] - fn hard_sigmoid(x in x_strat(), alpha in any::<$ti>(), beta in any::<$ti>()) { + fn max_const_big_alpha() { if $cond { run_kernel_test::<$ti, $ker>( - &x, - &$crate::frame::activations::definitions::hard_sigmoid(alpha, beta).ops, - |x| (x * alpha + beta).min(<$ti>::one()).max(<$ti>::zero()) - ); + &vec![<$ti>::zero(); <$ker>::nr()], + &[MaxConst(7.567773e37.into())], + |x| x.max(7.567773e37.into()), + ); } } #[test] - fn hard_swish(x in x_strat()) { + fn move_b_to_a_0() { if $cond { run_kernel_test::<$ti, $ker>( - &x, - &$crate::frame::activations::definitions::hard_swish().ops, - |x| (x * 1./6. + 0.5).min(<$ti>::one()).max(<$ti>::zero()) * x - ); + &*vec![<$ti>::zero(); <$ker>::nr()], + &[Load(RegisterId::B, 1.0 as _), Move(RegisterId::A, RegisterId::B)], + |_| 1.0 as _, + ); } } + + proptest::proptest! { + #[test] + fn relu_prop(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::relu().ops, + |x| x.max(<$ti>::zero()) + ); + } + } + + #[test] + fn affine_prop(x in x_strat(), alpha in any::<$ti>(), beta in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::affine(alpha, beta).ops, + |x| x * alpha + beta + ); + } + } + + #[test] + fn leaky_relu_prop(x in x_strat(), alpha in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::leaky_relu(alpha).ops, + |x| if x >= <$ti>::zero() { x } else { alpha * x } + ); + } + } + + #[test] + fn hard_sigmoid(x in x_strat(), alpha in any::<$ti>(), beta in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::hard_sigmoid(alpha, beta).ops, + |x| (x * alpha + beta).min(<$ti>::one()).max(<$ti>::zero()) + ); + } + } + + #[test] + fn hard_swish(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::hard_swish().ops, + |x| (x * 1./6. + 0.5).min(<$ti>::one()).max(<$ti>::zero()) * x + ); + } + } + } + /* + prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); + prop_act_e2e!($cond, $ti, $ker, leaky_relu(alpha)); + prop_act_e2e!($cond, $ti, $ker, threshold_relu(alpha)); + prop_act_e2e!($cond, $ti, $ker, softsign()); + prop_act_e2e!($cond, $ti, $ker, hardswish()); + /* + prop_activation!($cond, $ti, $ker, sigmoid()); + prop_activation!($cond, $ti, $ker, exp2f()); + */ + */ } - /* - prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); - prop_act_e2e!($cond, $ti, $ker, leaky_relu(alpha)); - prop_act_e2e!($cond, $ti, $ker, threshold_relu(alpha)); - prop_act_e2e!($cond, $ti, $ker, softsign()); - prop_act_e2e!($cond, $ti, $ker, hardswish()); - /* - prop_activation!($cond, $ti, $ker, sigmoid()); - prop_activation!($cond, $ti, $ker, exp2f()); - */ - */ - } - }; -} + }; + } From 904c65250879e43b8b06e9bdf6909eb74c6f874b Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 15 May 2023 22:07:26 +0200 Subject: [PATCH 20/25] threshold, softsign --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 36 +++++++++++-- linalg/src/frame/activations/tests.rs | 54 +++++++++++++++---- 2 files changed, 76 insertions(+), 14 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 0955d07173..815aee0283 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -187,9 +187,26 @@ dup v23.4s, v24.s[0] b .inner_loop .abs: - b .unsupported + fabs v0.4s, v0.4s + fabs v1.4s, v1.4s + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fabs v4.4s, v4.4s + fabs v5.4s, v5.4s + fabs v6.4s, v6.4s + fabs v7.4s, v7.4s + b .inner_loop .recip: - b .unsupported + fmov v24.4s, #1.0 + fdiv v0.4s, v24.4s, v0.4s + fdiv v1.4s, v24.4s, v1.4s + fdiv v2.4s, v24.4s, v2.4s + fdiv v3.4s, v24.4s, v3.4s + fdiv v4.4s, v24.4s, v4.4s + fdiv v5.4s, v24.4s, v5.4s + fdiv v6.4s, v24.4s, v6.4s + fdiv v7.4s, v24.4s, v7.4s + b .inner_loop .add: b .unsupported .sub: @@ -221,8 +238,21 @@ fadd v6.4s, v6.4s, v24.4s fadd v7.4s, v7.4s, v24.4s b .inner_loop + .sub_const: - b .unsupported + ins v24.s[0], w3 + add x5, x5, 4 + dup v24.4s, v24.s[0] + fsub v0.4s, v0.4s, v24.4s + fsub v1.4s, v1.4s, v24.4s + fsub v2.4s, v2.4s, v24.4s + fsub v3.4s, v3.4s, v24.4s + fsub v4.4s, v4.4s, v24.4s + fsub v5.4s, v5.4s, v24.4s + fsub v6.4s, v6.4s, v24.4s + fsub v7.4s, v7.4s, v24.4s + b .inner_loop + .mul_const: ins v24.s[0], w3 add x5, x5, 4 diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 4515581d1c..97fe1e1bd7 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -98,6 +98,20 @@ macro_rules! act_tests { } } + #[test] + fn abs_prop(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Abs], |x| x.abs()); + } + } + + #[test] + fn recip_prop(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Recip], |x| x.recip()); + } + } + #[test] fn add_const_prop(alpha in any::<$ti>(), x in x_strat()) { if $cond { @@ -105,6 +119,13 @@ macro_rules! act_tests { } } + #[test] + fn sub_const_prop(alpha in any::<$ti>(), x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[SubConst(alpha)], |x| x - alpha); + } + } + #[test] fn mul_const_prop(alpha in any::<$ti>(), x in x_strat()) { if $cond { @@ -210,6 +231,17 @@ macro_rules! act_tests { } } + #[test] + fn threshold_relu_prop(x in x_strat(), alpha in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::threshold_relu(alpha).ops, + |x| if x >= alpha { x } else { <$ti>::zero() } + ); + } + } + #[test] fn hard_sigmoid(x in x_strat(), alpha in any::<$ti>(), beta in any::<$ti>()) { if $cond { @@ -221,6 +253,17 @@ macro_rules! act_tests { } } + #[test] + fn softsign(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::softsign().ops, + |x| x / ( <$ti>::one() + x.abs()) + ); + } + } + #[test] fn hard_swish(x in x_strat()) { if $cond { @@ -232,17 +275,6 @@ macro_rules! act_tests { } } } - /* - prop_act_e2e!($cond, $ti, $ker, affine(alpha, beta)); - prop_act_e2e!($cond, $ti, $ker, leaky_relu(alpha)); - prop_act_e2e!($cond, $ti, $ker, threshold_relu(alpha)); - prop_act_e2e!($cond, $ti, $ker, softsign()); - prop_act_e2e!($cond, $ti, $ker, hardswish()); - /* - prop_activation!($cond, $ti, $ker, sigmoid()); - prop_activation!($cond, $ti, $ker, exp2f()); - */ - */ } }; } From 5fde35fafd7c4e78836f8852900c34c0e5977310 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 15 May 2023 22:12:27 +0200 Subject: [PATCH 21/25] missing ops --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 56 +++++++++++++++++-- linalg/src/frame/activations/tests.rs | 29 ++++++++++ 2 files changed, 81 insertions(+), 4 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 815aee0283..fcd04311e6 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -152,10 +152,12 @@ adr x4, .load_jmp_table add x4, x4, x7, LSL#2 br x4 + .load_jmp_table: b .load_a b .load_b b .load_c + .load_a: dup v0.4s, v24.s[0] dup v1.4s, v24.s[0] @@ -166,6 +168,7 @@ dup v6.4s, v24.s[0] dup v7.4s, v24.s[0] b .inner_loop + .load_b: dup v8.4s, v24.s[0] dup v9.4s, v24.s[0] @@ -176,6 +179,7 @@ dup v14.4s, v24.s[0] dup v15.4s, v24.s[0] b .inner_loop + .load_c: dup v16.4s, v24.s[0] dup v17.4s, v24.s[0] @@ -186,6 +190,7 @@ dup v22.4s, v24.s[0] dup v23.4s, v24.s[0] b .inner_loop + .abs: fabs v0.4s, v0.4s fabs v1.4s, v1.4s @@ -196,6 +201,7 @@ fabs v6.4s, v6.4s fabs v7.4s, v7.4s b .inner_loop + .recip: fmov v24.4s, #1.0 fdiv v0.4s, v24.4s, v0.4s @@ -207,10 +213,29 @@ fdiv v6.4s, v24.4s, v6.4s fdiv v7.4s, v24.4s, v7.4s b .inner_loop + .add: - b .unsupported + fadd v0.4s, v0.4s, v8.4s + fadd v1.4s, v1.4s, v9.4s + fadd v2.4s, v2.4s, v10.4s + fadd v3.4s, v3.4s, v11.4s + fadd v4.4s, v4.4s, v12.4s + fadd v5.4s, v5.4s, v13.4s + fadd v6.4s, v6.4s, v14.4s + fadd v7.4s, v7.4s, v15.4s + b .inner_loop + .sub: - b .unsupported + fsub v0.4s, v0.4s, v8.4s + fsub v1.4s, v1.4s, v9.4s + fsub v2.4s, v2.4s, v10.4s + fsub v3.4s, v3.4s, v11.4s + fsub v4.4s, v4.4s, v12.4s + fsub v5.4s, v5.4s, v13.4s + fsub v6.4s, v6.4s, v14.4s + fsub v7.4s, v7.4s, v15.4s + b .inner_loop + .mul: fmul v0.4s, v0.4s, v8.4s fmul v1.4s, v1.4s, v9.4s @@ -221,10 +246,29 @@ fmul v6.4s, v6.4s, v14.4s fmul v7.4s, v7.4s, v15.4s b .inner_loop + .min: - b .unsupported + fmin v0.4s, v0.4s, v8.4s + fmin v1.4s, v1.4s, v9.4s + fmin v2.4s, v2.4s, v10.4s + fmin v3.4s, v3.4s, v11.4s + fmin v4.4s, v4.4s, v12.4s + fmin v5.4s, v5.4s, v13.4s + fmin v6.4s, v6.4s, v14.4s + fmin v7.4s, v7.4s, v15.4s + b .inner_loop + .max: - b .unsupported + fmax v0.4s, v0.4s, v8.4s + fmax v1.4s, v1.4s, v9.4s + fmax v2.4s, v2.4s, v10.4s + fmax v3.4s, v3.4s, v11.4s + fmax v4.4s, v4.4s, v12.4s + fmax v5.4s, v5.4s, v13.4s + fmax v6.4s, v6.4s, v14.4s + fmax v7.4s, v7.4s, v15.4s + b .inner_loop + .add_const: ins v24.s[0], w3 add x5, x5, 4 @@ -297,6 +341,7 @@ .fma: b .unsupported + .if_pos_then_else: fcmge v0.4s, v0.4s, #0.0 fcmge v1.4s, v1.4s, #0.0 @@ -315,10 +360,13 @@ bsl v6.16b, v14.16b, v22.16b bsl v7.16b, v15.16b, v23.16b b .inner_loop + .swap_b_c: b .unsupported + .floor: b .unsupported + .two_pow_of_int: b .unsupported diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index 97fe1e1bd7..f4331d6e55 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -147,6 +147,20 @@ macro_rules! act_tests { } } + #[test] + fn add_prop(x in x_strat(), v in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Load(RegisterId::B, v), Add], |x| x + v); + } + } + + #[test] + fn sub_prop(x in x_strat(), v in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Load(RegisterId::B, v), Sub], |x| x - v); + } + } + #[test] fn mul_prop(x in x_strat(), v in any::<$ti>()) { if $cond { @@ -154,6 +168,21 @@ macro_rules! act_tests { } } + #[test] + fn min_prop(x in x_strat(), v in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Load(RegisterId::B, v), Min], |x| x.min(v)); + } + } + + #[test] + fn max_prop(x in x_strat(), v in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, &[Load(RegisterId::B, v), Max], |x| x.max(v)); + } + } + + #[test] fn ifposte_prop(x in x_strat()) { if $cond { From be9d8cce5f30562ce54184a8ed5e8feb4fbe58e6 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 22 May 2023 21:23:06 +0200 Subject: [PATCH 22/25] benches. --- linalg/Cargo.toml | 5 ++++ linalg/benches/activations.rs | 48 +++++++++++++++++++++++++++++++++ linalg/src/frame/activations.rs | 5 ++++ 3 files changed, 58 insertions(+) create mode 100644 linalg/benches/activations.rs diff --git a/linalg/Cargo.toml b/linalg/Cargo.toml index f575560b1c..16ac5d2083 100644 --- a/linalg/Cargo.toml +++ b/linalg/Cargo.toml @@ -97,5 +97,10 @@ name = "x86_64" harness = false [[bench]] +bench = false name = "intel" harness = false + +[[bench]] +name = "activations" +harness = false diff --git a/linalg/benches/activations.rs b/linalg/benches/activations.rs new file mode 100644 index 0000000000..e76fb2672a --- /dev/null +++ b/linalg/benches/activations.rs @@ -0,0 +1,48 @@ +use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion}; +use tract_linalg::frame::activations::{definitions, reference, ActivationKer, Program}; + +fn crit(c: &mut Criterion, name: &str, r: impl Fn(f32) -> f32, prog: &Program) { + let mut group = c.benchmark_group(name); + for size in [1i32, 32, 256, 1024, 8192].iter() { + group.throughput(criterion::Throughput::Elements(*size as u64)); + group.bench_with_input(BenchmarkId::new("Reference", size), size, |b, size| { + b.iter_batched( + || vec![1.0f32; *size as usize], + |v| { + for x in v { + r(black_box(x)); + } + }, + BatchSize::LargeInput, + ) + }); + #[allow(unused_mut)] + let mut vms = vec!(tract_linalg::generic::activations::SActivations::act()); + #[cfg(target_arch="aarch64")] + { + vms.push(tract_linalg::arm64::arm64simd_act_f32_32n::act()); + } + + for vm in vms { + group.bench_with_input(BenchmarkId::new(vm.name(), size), size, |b, size| { + b.iter_batched( + || vec![1.0f32; *size as usize], + |mut v| vm.run(prog, &mut v), + BatchSize::LargeInput, + ) + }); + } + } +} + +fn criterion_benchmark(c: &mut Criterion) { + crit(c, "relu", reference::relu, &definitions::relu()); + crit(c, "hardswish", reference::hardswish, &definitions::hard_swish()); + /* + crit(c, "exp2f", reference::exp2f, &definitions::exp2f()); + crit(c, "sigmoid", reference::sigmoid, &definitions::sigmoid()); + */ +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index 586f0e29c7..48a30a5f17 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -142,6 +142,7 @@ pub union OpOrConst { } pub trait Activation: Send + Sync + Debug + dyn_clone::DynClone { + fn name(&self) -> &'static str; fn run(&self, prog: &Program, vec: &mut [T]) -> TractResult<()>; } @@ -159,6 +160,10 @@ where T: LADatum, K: ActivationKer + Clone, { + fn name(&self) -> &'static str { + K::name() + } + fn run(&self, program: &Program, vec: &mut [T]) -> TractResult<()> { let ker_program = program.translate(); run_over_slice_with_alignment( From de1829dac3ebc4a876602419433b91b0d1df7b4e Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 22 May 2023 22:19:19 +0200 Subject: [PATCH 23/25] sigmoid --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 66 +++++- linalg/benches/activations.rs | 31 ++- linalg/src/frame/activations/definitions.rs | 224 ++++++++---------- linalg/src/frame/activations/tests.rs | 37 ++- 4 files changed, 222 insertions(+), 136 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index fcd04311e6..d82df4bee3 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -339,8 +339,41 @@ fmax v7.4s, v7.4s, v24.4s b .inner_loop -.fma: - b .unsupported +.fma: + // a <- a * b + k + // vfma a,b,c does a <- a + b * c + // mov d,a ; mov a,#k ; vfma a, b, d + + and v24.16b, v0.16b, v0.16b + and v25.16b, v1.16b, v1.16b + and v26.16b, v2.16b, v2.16b + and v27.16b, v3.16b, v3.16b + and v28.16b, v4.16b, v4.16b + and v29.16b, v5.16b, v5.16b + and v30.16b, v6.16b, v6.16b + and v31.16b, v7.16b, v7.16b + + ins v0.s[0], w3 + add x5, x5, 4 + dup v0.4s, v0.s[0] + dup v1.4s, v0.s[0] + dup v2.4s, v0.s[0] + dup v3.4s, v0.s[0] + dup v4.4s, v0.s[0] + dup v5.4s, v0.s[0] + dup v6.4s, v0.s[0] + dup v7.4s, v0.s[0] + + fmla v0.4s, v24.4s, v8.4s + fmla v1.4s, v25.4s, v9.4s + fmla v2.4s, v26.4s, v10.4s + fmla v3.4s, v27.4s, v11.4s + fmla v4.4s, v28.4s, v12.4s + fmla v5.4s, v29.4s, v13.4s + fmla v6.4s, v30.4s, v14.4s + fmla v7.4s, v31.4s, v15.4s + + b .inner_loop .if_pos_then_else: fcmge v0.4s, v0.4s, #0.0 @@ -362,7 +395,34 @@ b .inner_loop .swap_b_c: - b .unsupported +// move d <- b + and v24.16b, v8.16b , v8.16b + and v25.16b, v9.16b , v9.16b + and v26.16b, v10.16b, v10.16b + and v27.16b, v11.16b, v11.16b + and v28.16b, v12.16b, v12.16b + and v29.16b, v13.16b, v13.16b + and v30.16b, v14.16b, v14.16b + and v31.16b, v15.16b, v15.16b +// move b <- c + and v8.16b , v16.16b, v16.16b + and v9.16b , v17.16b, v17.16b + and v10.16b, v18.16b, v18.16b + and v11.16b, v19.16b, v19.16b + and v12.16b, v20.16b, v20.16b + and v13.16b, v21.16b, v21.16b + and v14.16b, v22.16b, v22.16b + and v15.16b, v23.16b, v23.16b +// move c <- d + and v16.16b, v24.16b, v24.16b + and v17.16b, v25.16b, v25.16b + and v18.16b, v26.16b, v26.16b + and v19.16b, v27.16b, v27.16b + and v20.16b, v28.16b, v28.16b + and v21.16b, v29.16b, v29.16b + and v22.16b, v30.16b, v30.16b + and v23.16b, v31.16b, v31.16b + b .inner_loop .floor: b .unsupported diff --git a/linalg/benches/activations.rs b/linalg/benches/activations.rs index e76fb2672a..9869aea59b 100644 --- a/linalg/benches/activations.rs +++ b/linalg/benches/activations.rs @@ -1,9 +1,11 @@ use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion}; use tract_linalg::frame::activations::{definitions, reference, ActivationKer, Program}; +const SIZES:&[i32] = &[32, 256, 1024, 8192]; + fn crit(c: &mut Criterion, name: &str, r: impl Fn(f32) -> f32, prog: &Program) { let mut group = c.benchmark_group(name); - for size in [1i32, 32, 256, 1024, 8192].iter() { + for size in SIZES { group.throughput(criterion::Throughput::Elements(*size as u64)); group.bench_with_input(BenchmarkId::new("Reference", size), size, |b, size| { b.iter_batched( @@ -14,7 +16,7 @@ fn crit(c: &mut Criterion, name: &str, r: impl Fn(f32) -> f32, prog: &Program f32, prog: &Program f32, prog: &Program() -> Program { pub fn affine(alpha: T, beta: T) -> Program { Program { #[rustfmt::skip] - ops: vec![ - MulConst(alpha), - AddConst(beta), - ], + ops: vec![ + MulConst(alpha), + AddConst(beta), + ], } } pub fn leaky_relu(alpha: T) -> Program { Program { #[rustfmt::skip] - ops: vec![ - Move(B,A), - MulConst(alpha), - Move(C,A), - Move(A,B), - IfPosTE, - ], + ops: vec![ + Move(B,A), + MulConst(alpha), + Move(C,A), + Move(A,B), + IfPosTE, + ], } } pub fn threshold_relu(alpha: T) -> Program { Program { #[rustfmt::skip] - ops: vec![ - Move(B,A), - SubConst(alpha), - Load(C, T::zero()), - IfPosTE, - ], + ops: vec![ + Move(B,A), + SubConst(alpha), + Load(C, T::zero()), + IfPosTE, + ], } } pub fn hard_sigmoid(alpha: T, beta: T) -> Program { Program { #[rustfmt::skip] - ops: vec![ - MulConst(alpha), - AddConst(beta), - MinConst(T::one()), - MaxConst(T::zero()), - ], + ops: vec![ + MulConst(alpha), + AddConst(beta), + MinConst(T::one()), + MaxConst(T::zero()), + ], } } pub fn softsign() -> Program { Program { #[rustfmt::skip] - ops: vec![ - Move(B,A), - Abs, - AddConst(T::one()), - Recip, - Mul, - ], + ops: vec![ + Move(B,A), + Abs, + AddConst(T::one()), + Recip, + Mul, + ], } } @@ -71,104 +71,88 @@ pub fn hard_swish() -> Program { let one_half = T::one() / (T::one() + T::one()); Program { #[rustfmt::skip] - ops: vec![ - Move(B, A), - MulConst(one_sixth), - AddConst(one_half), - MinConst(T::one()), - MaxConst(T::zero()), - Mul, - ], + ops: vec![ + Move(B, A), + MulConst(one_sixth), + AddConst(one_half), + MinConst(T::one()), + MaxConst(T::zero()), + Mul, + ], } } -/* -pub fn sigmoid() -> Program { +pub fn sigmoid() -> Program { Program { - #[rustfmt::skip] - ops: vec![ - MinConst(3), - MaxConst(2), - Move(B, A), // b = x - Move(C, A), // c = x - Mul, // a = x2 - Move(B, A), // b = x2 - MulConst(4), - AddConst(5), // a = x2 * a13 + a11 - FMA(6), - FMA(7), - FMA(8), - FMA(9), - FMA(10), - SwapBC, // c = x2, b = x - Mul, // a = p(x) - Move(B, C), // b = x2 - Move(C, A), // c = p(x) - Move(A, B), // a = x2 - MulConst(11), - AddConst(12), - FMA(13), - FMA(1), // a = q(x) - Recip, - Move(B,C), // b = p(x) - Mul, - AddConst(14) - ], - csts: vec![ - -18.6, // const 2 - 18.6, // const 3 - -4.433153405e-18, // const 4, also alpha_13 - 1.169974371e-14, // const 5, also a11 - -1.875289645e-11, - 4.257889523e-8, - 0.00004811817576, // const 8 - 0.008163842030, - 0.2499999971, // alpha_1 - 3.922935744e-6, // beta_6 - 0.001524872358, // const 12 - 0.1159886749, - 0.5, //beta_0 + ops: vec![ + MaxConst(-18.6), // const 2 + MinConst(18.6), // const 3 + Move(B, A), // b = x + Move(C, A), // c = x + Mul, // a = x2 + Move(B, A), // b = x2 + MulConst(-4.433153405e-18), // const 4, also alpha_13 + AddConst(1.169974371e-14), // const 5, also a11 + FMA(-1.875289645e-11), + FMA(4.257889523e-8), + FMA(0.00004811817576), // const 8 + FMA(0.008163842030), + FMA(0.2499999971), // alpha_1 + SwapBC, // c = x2, b = x + Mul, // a = p(x) + Move(B, C), // b = x2 + Move(C, A), // c = p(x) + Move(A, B), // a = x2 + MulConst(3.922935744e-6), // beta_6 + AddConst(0.001524872358), // const 12 + FMA(0.1159886749), + FMA(1.0), // a = q(x) + Recip, + Move(B, C), // b = p(x) + Mul, + AddConst(0.5), //beta_0 ], } } +/* pub fn exp2f() -> Program { - Program { - #[rustfmt::skip] - ops: vec![ - MinConst(2), - MaxConst(3), - Move(B, A), // b = x - AddConst(4), // a = x + 0.5 - Floor, // a = ipart - Move(C, A), // c = ipart - Move(A, B), // a = x - Move(B, C), // b = ipart - Sub, // a = fpart - Move(B, A), // b = fpart - Load(A, 5), // a = exp2p[0] - FMA(6), - FMA(7), - FMA(8), - FMA(9), - FMA(10), - FMA(1), // a = y - Move(B, A), - Move(A, C), - TwoPowOfInt, - Mul - ], - csts: vec![ - 127f32, - -127f32, - 0.5, - 1.535336188319500e-4, - 1.339887440266574e-3, - 9.618437357674640e-3, - 5.550332471162809e-2, - 2.402264791363012e-1, - 6.931472028550421e-1, - ], - } +Program { +#[rustfmt::skip] +ops: vec![ +MinConst(2), +MaxConst(3), +Move(B, A), // b = x +AddConst(4), // a = x + 0.5 +Floor, // a = ipart +Move(C, A), // c = ipart +Move(A, B), // a = x +Move(B, C), // b = ipart +Sub, // a = fpart +Move(B, A), // b = fpart +Load(A, 5), // a = exp2p[0] +FMA(6), +FMA(7), +FMA(8), +FMA(9), +FMA(10), +FMA(1), // a = y +Move(B, A), +Move(A, C), +TwoPowOfInt, +Mul +], +csts: vec![ +127f32, +-127f32, +0.5, +1.535336188319500e-4, +1.339887440266574e-3, +9.618437357674640e-3, +5.550332471162809e-2, +2.402264791363012e-1, +6.931472028550421e-1, +], +} } */ diff --git a/linalg/src/frame/activations/tests.rs b/linalg/src/frame/activations/tests.rs index f4331d6e55..2987a0f774 100644 --- a/linalg/src/frame/activations/tests.rs +++ b/linalg/src/frame/activations/tests.rs @@ -189,10 +189,28 @@ macro_rules! act_tests { run_kernel_test::<$ti, $ker>(&x, &[Load(RegisterId::B, 2 as _), Load(RegisterId::C, 3 as _), IfPosTE], |x| if x >= <$ti>::zero() { 2 as _ } else { 3 as _ }); - } } } + #[test] + fn swapbc_prop(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, + &[Load(RegisterId::B, 2 as _), Load(RegisterId::C, 3 as _), SwapBC, IfPosTE], + |x| if x >= <$ti>::zero() { 3 as _ } else { 2 as _ }); + } + } + + #[test] + fn fma_prop(x in x_strat(), b in any::<$ti>(), k in any::<$ti>()) { + if $cond { + run_kernel_test::<$ti, $ker>(&x, + &[Load(RegisterId::B, b), FMA(k)], + |x| x * b + k); + } + } + } + #[test] fn max_const_zero() { if $cond { @@ -272,7 +290,7 @@ macro_rules! act_tests { } #[test] - fn hard_sigmoid(x in x_strat(), alpha in any::<$ti>(), beta in any::<$ti>()) { + fn hard_sigmoid_prop(x in x_strat(), alpha in any::<$ti>(), beta in any::<$ti>()) { if $cond { run_kernel_test::<$ti, $ker>( &x, @@ -283,7 +301,7 @@ macro_rules! act_tests { } #[test] - fn softsign(x in x_strat()) { + fn softsign_prop(x in x_strat()) { if $cond { run_kernel_test::<$ti, $ker>( &x, @@ -294,7 +312,7 @@ macro_rules! act_tests { } #[test] - fn hard_swish(x in x_strat()) { + fn hard_swish_prop(x in x_strat()) { if $cond { run_kernel_test::<$ti, $ker>( &x, @@ -303,6 +321,17 @@ macro_rules! act_tests { ); } } + + #[test] + fn sigmoid_prop(x in x_strat()) { + if $cond { + run_kernel_test::<$ti, $ker>( + &x, + &$crate::frame::activations::definitions::sigmoid().ops, + crate::generic::sigmoid::ssigmoid + ); + } + } } } }; From 9ee3fe0ad66bd4c60654f9e849b302a7aa28df20 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 5 Jun 2023 21:18:04 +0200 Subject: [PATCH 24/25] replace fmla by mul and add in FMA --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 54 ++++++++----------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index d82df4bee3..8887dba744 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -339,39 +339,29 @@ fmax v7.4s, v7.4s, v24.4s b .inner_loop -.fma: - // a <- a * b + k - // vfma a,b,c does a <- a + b * c - // mov d,a ; mov a,#k ; vfma a, b, d - - and v24.16b, v0.16b, v0.16b - and v25.16b, v1.16b, v1.16b - and v26.16b, v2.16b, v2.16b - and v27.16b, v3.16b, v3.16b - and v28.16b, v4.16b, v4.16b - and v29.16b, v5.16b, v5.16b - and v30.16b, v6.16b, v6.16b - and v31.16b, v7.16b, v7.16b - - ins v0.s[0], w3 +.fma: + // a <- a * b + fmul v0.4s, v0.4s, v8.4s + fmul v1.4s, v1.4s, v9.4s + fmul v2.4s, v2.4s, v10.4s + fmul v3.4s, v3.4s, v11.4s + fmul v4.4s, v4.4s, v12.4s + fmul v5.4s, v5.4s, v13.4s + fmul v6.4s, v6.4s, v14.4s + fmul v7.4s, v7.4s, v15.4s + + // a <- a + k + ins v24.s[0], w3 add x5, x5, 4 - dup v0.4s, v0.s[0] - dup v1.4s, v0.s[0] - dup v2.4s, v0.s[0] - dup v3.4s, v0.s[0] - dup v4.4s, v0.s[0] - dup v5.4s, v0.s[0] - dup v6.4s, v0.s[0] - dup v7.4s, v0.s[0] - - fmla v0.4s, v24.4s, v8.4s - fmla v1.4s, v25.4s, v9.4s - fmla v2.4s, v26.4s, v10.4s - fmla v3.4s, v27.4s, v11.4s - fmla v4.4s, v28.4s, v12.4s - fmla v5.4s, v29.4s, v13.4s - fmla v6.4s, v30.4s, v14.4s - fmla v7.4s, v31.4s, v15.4s + dup v24.4s, v24.s[0] + fadd v0.4s, v0.4s, v24.4s + fadd v1.4s, v1.4s, v24.4s + fadd v2.4s, v2.4s, v24.4s + fadd v3.4s, v3.4s, v24.4s + fadd v4.4s, v4.4s, v24.4s + fadd v5.4s, v5.4s, v24.4s + fadd v6.4s, v6.4s, v24.4s + fadd v7.4s, v7.4s, v24.4s b .inner_loop From dd9f002b11335cc379379fa75d823d93c85fceb7 Mon Sep 17 00:00:00 2001 From: Mathieu Poumeyrol Date: Mon, 19 Jun 2023 20:11:29 +0200 Subject: [PATCH 25/25] wip for benches (noop micro o) --- .../arm64simd/arm64simd_act_f32_32n.tmpl | 3 + linalg/benches/activations.rs | 78 +++++++++++++------ linalg/src/frame/activations.rs | 3 + linalg/src/generic/activations.rs | 1 + 4 files changed, 62 insertions(+), 23 deletions(-) diff --git a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl index 8887dba744..977c5c8541 100644 --- a/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl +++ b/linalg/arm64/arm64simd/arm64simd_act_f32_32n.tmpl @@ -420,6 +420,9 @@ .two_pow_of_int: b .unsupported +.noop: + b .inner_loop + .unsupported: mov x0, 1 b .return diff --git a/linalg/benches/activations.rs b/linalg/benches/activations.rs index 9869aea59b..7a18f77435 100644 --- a/linalg/benches/activations.rs +++ b/linalg/benches/activations.rs @@ -1,9 +1,21 @@ use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion}; -use tract_linalg::frame::activations::{definitions, reference, ActivationKer, Program}; +use tract_linalg::frame::activations::{ + definitions, reference, Activation, ActivationKer, Op, Program, +}; -const SIZES:&[i32] = &[32, 256, 1024, 8192]; +const SIZES: &[i32] = &[32, 256, 1024, 8192]; -fn crit(c: &mut Criterion, name: &str, r: impl Fn(f32) -> f32, prog: &Program) { +fn vms() -> Vec>> { + #[allow(unused_mut)] + let mut vms = vec![tract_linalg::generic::activations::SActivations::act()]; + #[cfg(target_arch = "aarch64")] + { + vms.push(tract_linalg::arm64::arm64simd_act_f32_32n::act()); + } + vms +} + +fn e2e(c: &mut Criterion, name: &str, r: impl Fn(f32) -> f32, prog: &Program) { let mut group = c.benchmark_group(name); for size in SIZES { group.throughput(criterion::Throughput::Elements(*size as u64)); @@ -16,22 +28,15 @@ fn crit(c: &mut Criterion, name: &str, r: impl Fn(f32) -> f32, prog: &Program f32, prog: &Program) { + let mut group = c.benchmark_group(format!("unit/{name}")); + for size in SIZES { + group.throughput(criterion::Throughput::Elements(*size as u64)); + for vm in vms() { + group.bench_with_input( + BenchmarkId::new(vm.name(), size), + size, + |b, size| { + b.iter_batched( + || vec![1.0f32; *size as usize], + |mut v| vm.run(prog, &mut v), + BatchSize::LargeInput, + ) + }, + ); + } } +} + +fn unit_benchmarks(c: &mut Criterion) { + unit(c, "empty", &Program { ops: vec![] }); + unit(c, "noop", &Program { ops: vec![Op::Noop] }); + unit(c, "noop2", &Program { ops: vec![Op::Noop, Op::Noop] }); + unit(c, "noop100", &Program { ops: vec![Op::Noop; 100] }); + unit(c, "add100", &Program { ops: vec![Op::AddConst(1.0); 100] }); + unit(c, "sub100", &Program { ops: vec![Op::SubConst(1.0); 100] }); + unit(c, "mul100", &Program { ops: vec![Op::MulConst(1.0); 100] }); + unit(c, "fma100", &Program { ops: vec![Op::FMA(1.0); 100] }); +} - criterion_group!(benches, criterion_benchmark); - criterion_main!(benches); +criterion_group!(benches, /* e2e_benchmarks, */ unit_benchmarks); +criterion_main!(benches); diff --git a/linalg/src/frame/activations.rs b/linalg/src/frame/activations.rs index 48a30a5f17..ede87b102b 100644 --- a/linalg/src/frame/activations.rs +++ b/linalg/src/frame/activations.rs @@ -47,6 +47,7 @@ pub enum Op { SwapBC, Floor, TwoPowOfInt, + Noop, } impl Program { @@ -94,6 +95,7 @@ impl Program { Op::SwapBC => ops.push(OpOrConst { op: KerOp::SwapBC }), Op::Floor => ops.push(OpOrConst { op: KerOp::Floor }), Op::TwoPowOfInt => ops.push(OpOrConst { op: KerOp::TwoPowOfInt }), + Op::Noop => ops.push(OpOrConst { op: KerOp::Noop }), } } ops.push(OpOrConst { op: KerOp::Done }); @@ -127,6 +129,7 @@ pub enum KerOp { SwapBC, // jump_to:swap_b_c Floor, // jump_to:floor TwoPowOfInt, // jump_to:two_pow_of_int + Noop // jump_to:noop } #[derive(Clone)] diff --git a/linalg/src/generic/activations.rs b/linalg/src/generic/activations.rs index 1cbb8c02ee..c77f40dfc6 100644 --- a/linalg/src/generic/activations.rs +++ b/linalg/src/generic/activations.rs @@ -70,6 +70,7 @@ unsafe fn compute_slice(ops: *const OpOrConst, xs: *mut f32, len: usize) { KerOp::TwoPowOfInt => { a.iter_mut().for_each(|x| *x = f32::from_bits((((*x as i32) + 127) as u32) << 23)) } + KerOp::Noop => {}, } pc = pc.add(1); }