Skip to content

Commit

Permalink
Implement Neon SIMD
Browse files Browse the repository at this point in the history
Closes #46
  • Loading branch information
CryZe authored Aug 24, 2022
1 parent 375ab05 commit 3457f25
Show file tree
Hide file tree
Showing 10 changed files with 480 additions and 119 deletions.
27 changes: 26 additions & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ env:
CARGO_TERM_COLOR: always

jobs:
build:
x86:
runs-on: ubuntu-20.04
strategy:
matrix:
Expand Down Expand Up @@ -77,3 +77,28 @@ jobs:
env:
RUSTFLAGS: -Ctarget-feature=+simd128,+bulk-memory,+nontrapping-fptoint,+sign-ext
run: cargo test --target wasm32-wasi

aarch64:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v2

- name: Install toolchain
uses: actions-rs/toolchain@v1
with:
toolchain: stable
override: true
target: aarch64-unknown-linux-gnu

- name: Install cross
run: cargo install cross

- name: Build with minimal features (no_std)
run: cross build --target aarch64-unknown-linux-gnu --verbose --no-default-features --features no-std-float

- name: Run tests without SIMD
run: cross test --target aarch64-unknown-linux-gnu --verbose --no-default-features --features png-format

- name: Run tests with Neon
run: cross test --target aarch64-unknown-linux-gnu
5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@ default = ["std", "simd", "png-format"]
std = ["tiny-skia-path/std"]
no-std-float = ["tiny-skia-path/no-std-float"]

# Enables x86 SIMD instructions from SSE up to AVX2.
# Has no effect on non-x86 targets. Present mainly for testing.
# Enables SIMD instructions on x86 (from SSE up to AVX2), WebAssembly (SIMD128)
# and Aarch64 (Neon).
# Has no effect other targets. Present mainly for testing.
simd = []

# Allows loading and saving `Pixmap` as PNG.
Expand Down
18 changes: 13 additions & 5 deletions path/src/f32x2_t.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,22 +36,22 @@ impl f32x2 {
/// Returns a minimum value.
pub fn min(self, other: f32x2) -> f32x2 {
f32x2([
self.x().min(other.x()),
self.y().min(other.y()),
pmin(self.x(), other.x()),
pmin(self.y(), other.y()),
])
}

/// Returns a maximum value.
pub fn max(self, other: f32x2) -> f32x2 {
f32x2([
self.x().max(other.x()),
self.y().max(other.y()),
pmax(self.x(), other.x()),
pmax(self.y(), other.y()),
])
}

/// Returns a maximum of both values.
pub fn max_component(self) -> f32 {
self.x().max(self.y())
pmax(self.x(), self.y())
}

/// Returns the first value.
Expand Down Expand Up @@ -104,3 +104,11 @@ impl core::ops::Div<f32x2> for f32x2 {
])
}
}

fn pmax(a: f32, b: f32) -> f32 {
if a < b { b } else { a }
}

fn pmin(a: f32, b: f32) -> f32 {
if b < a { b } else { a }
}
6 changes: 3 additions & 3 deletions src/pipeline/highp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
d + s * inv(da),
s.cmp_eq(f32x8::default()).blend(
d * inv(sa),
sa * (da - da.min((da - d) * sa * s.recip())) + s * inv(da) + d * inv(sa)
sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa)
)
)
);
Expand All @@ -426,7 +426,7 @@ blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
s * inv(da),
s.cmp_eq(sa).blend(
s + d * inv(sa),
sa * da.min((d * sa) * (sa - s).recip()) + s * inv(da) + d * inv(sa)
sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa)
)
)
);
Expand Down Expand Up @@ -456,7 +456,7 @@ blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| {
// 3. light src, light dst?
let dark_src = d * (sa + (s2 - sa) * (f32x8::splat(1.0) - m));
let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m;
let lite_dst = m.recip_sqrt().recip() - m;
let lite_dst = m.sqrt() - m;
let lite_src = d * sa + da * (s2 - sa)
* two(two(d)).cmp_le(da).blend(dark_dst, lite_dst); // 2 or 3?

Expand Down
8 changes: 4 additions & 4 deletions src/wide/f32x16_t.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ impl f32x16 {

pub fn floor(&self) -> Self {
// Yes, Skia does it in the same way.
let roundtrip = self.round_int();
let roundtrip = self.round();
roundtrip - roundtrip.cmp_gt(self).blend(f32x16::splat(1.0), f32x16::splat(0.0))
}

Expand All @@ -85,10 +85,10 @@ impl f32x16 {
])
}

pub fn round_int(&self) -> Self {
pub fn round(&self) -> Self {
Self([
self.0[0].round_int().to_f32x8(),
self.0[1].round_int().to_f32x8(),
self.0[0].round(),
self.0[1].round(),
])
}

Expand Down
50 changes: 40 additions & 10 deletions src/wide/f32x4_t.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ cfg_if::cfg_if! {
#[derive(Clone, Copy, Debug)]
#[repr(transparent)]
pub struct f32x4(v128);
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
use core::arch::aarch64::*;

#[derive(Clone, Copy, Debug)]
#[repr(C, align(16))]
pub struct f32x4(float32x4_t);
} else {
#[derive(Clone, Copy, Debug)]
#[repr(C, align(16))]
Expand All @@ -40,34 +46,46 @@ impl f32x4 {
}

pub fn max(self, rhs: Self) -> Self {
// These technically don't have the same semantics for NaN and 0, but it
// doesn't seem to matter as Skia does it the same way.
cfg_if::cfg_if! {
if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
Self(unsafe { _mm_max_ps(self.0, rhs.0) })
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
Self(f32x4_max(self.0, rhs.0))
Self(f32x4_pmax(self.0, rhs.0))
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
unsafe {
Self(vmaxq_f32(self.0, rhs.0))
}
} else {
Self([
self.0[0].max(rhs.0[0]),
self.0[1].max(rhs.0[1]),
self.0[2].max(rhs.0[2]),
self.0[3].max(rhs.0[3]),
super::pmax(self.0[0], rhs.0[0]),
super::pmax(self.0[1], rhs.0[1]),
super::pmax(self.0[2], rhs.0[2]),
super::pmax(self.0[3], rhs.0[3]),
])
}
}
}

pub fn min(self, rhs: Self) -> Self {
// These technically don't have the same semantics for NaN and 0, but it
// doesn't seem to matter as Skia does it the same way.
cfg_if::cfg_if! {
if #[cfg(all(feature = "simd", target_feature = "sse2"))] {
Self(unsafe { _mm_min_ps(self.0, rhs.0) })
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
Self(f32x4_min(self.0, rhs.0))
Self(f32x4_pmin(self.0, rhs.0))
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
unsafe {
Self(vminq_f32(self.0, rhs.0))
}
} else {
Self([
self.0[0].min(rhs.0[0]),
self.0[1].min(rhs.0[1]),
self.0[2].min(rhs.0[2]),
self.0[3].min(rhs.0[3]),
super::pmin(self.0[0], rhs.0[0]),
super::pmin(self.0[1], rhs.0[1]),
super::pmin(self.0[2], rhs.0[2]),
super::pmin(self.0[3], rhs.0[3]),
])
}
}
Expand Down Expand Up @@ -95,6 +113,10 @@ impl core::ops::Add for f32x4 {
Self(unsafe { _mm_add_ps(self.0, rhs.0) })
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
Self(f32x4_add(self.0, rhs.0))
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
unsafe {
Self(vaddq_f32(self.0, rhs.0))
}
} else {
Self([
self.0[0] + rhs.0[0],
Expand Down Expand Up @@ -122,6 +144,10 @@ impl core::ops::Sub for f32x4 {
Self(unsafe { _mm_sub_ps(self.0, rhs.0) })
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
Self(f32x4_sub(self.0, rhs.0))
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
unsafe {
Self(vsubq_f32(self.0, rhs.0))
}
} else {
Self([
self.0[0] - rhs.0[0],
Expand All @@ -143,6 +169,10 @@ impl core::ops::Mul for f32x4 {
Self(unsafe { _mm_mul_ps(self.0, rhs.0) })
} else if #[cfg(all(feature = "simd", target_feature = "simd128"))] {
Self(f32x4_mul(self.0, rhs.0))
} else if #[cfg(all(feature = "simd", target_arch = "aarch64", target_feature = "neon"))] {
unsafe {
Self(vmulq_f32(self.0, rhs.0))
}
} else {
Self([
self.0[0] * rhs.0[0],
Expand Down
Loading

0 comments on commit 3457f25

Please sign in to comment.