diff --git a/src/alpha/u16x2/mod.rs b/src/alpha/u16x2/mod.rs index 161c17a..64c203e 100644 --- a/src/alpha/u16x2/mod.rs +++ b/src/alpha/u16x2/mod.rs @@ -11,6 +11,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl AlphaMulDiv for U16x2 { fn multiply_alpha( @@ -25,6 +27,8 @@ impl AlphaMulDiv for U16x2 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha(src_image, dst_image) }, _ => native::multiply_alpha(src_image, dst_image), } } @@ -37,6 +41,8 @@ impl AlphaMulDiv for U16x2 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha_inplace(image) }, _ => native::multiply_alpha_inplace(image), } } @@ -53,6 +59,8 @@ impl AlphaMulDiv for U16x2 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha(src_image, dst_image) }, _ => native::divide_alpha(src_image, dst_image), } } @@ -65,6 +73,8 @@ impl AlphaMulDiv for U16x2 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha_inplace(image) }, _ => native::divide_alpha_inplace(image), } } diff --git a/src/alpha/u16x2/wasm32.rs b/src/alpha/u16x2/wasm32.rs new file mode 100644 index 0000000..aa18633 --- /dev/null +++ b/src/alpha/u16x2/wasm32.rs @@ -0,0 +1,212 @@ +use std::arch::wasm32::*; + +use crate::pixels::U16x2; +use crate::utils::foreach_with_pre_reading; +use crate::{ImageView, ImageViewMut}; + +use super::native; + +pub(crate) unsafe fn multiply_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + multiply_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn multiply_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + multiply_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row(src_row: &[U16x2], dst_row: &mut [U16x2]) { + let src_chunks = src_row.chunks_exact(4); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(4); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiplies_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + native::multiply_alpha_row(src_remainder, dst_reminder); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row_inplace(row: &mut [U16x2]) { + let mut chunks = row.chunks_exact_mut(4); + 
foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiplies_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let reminder = chunks.into_remainder(); + if !reminder.is_empty() { + native::multiply_alpha_row_inplace(reminder); + } +} + +#[inline] +unsafe fn multiplies_alpha_4_pixels(pixels: v128) -> v128 { + const HALF: v128 = i32x4(0x8000, 0x8000, 0x8000, 0x8000); + + const MAX_ALPHA: v128 = u32x4(0xffff0000u32, 0xffff0000u32, 0xffff0000u32, 0xffff0000u32); + /* + |L0 A0 | |L1 A1 | |L2 A2 | |L3 A3 | + |0001 0203| |0405 0607| |0809 1011| |1213 1415| + */ + const FACTOR_MASK: v128 = i8x16(2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15); + + let factor_pixels = u8x16_swizzle(pixels, FACTOR_MASK); + let factor_pixels = v128_or(factor_pixels, MAX_ALPHA); + + let src_u32_lo = u32x4_extend_low_u16x8(pixels); + let factors = u32x4_extend_low_u16x8(factor_pixels); + let src_i32_lo = i32x4_add(i32x4_mul(src_u32_lo, factors), HALF); + let dst_i32_lo = i32x4_add(src_i32_lo, u32x4_shr(src_i32_lo, 16)); + let dst_i32_lo = u32x4_shr(dst_i32_lo, 16); + + let src_u32_hi = u32x4_extend_high_u16x8(pixels); + let factors = u32x4_extend_high_u16x8(factor_pixels); + let src_i32_hi = i32x4_add(i32x4_mul(src_u32_hi, factors), HALF); + let dst_i32_hi = i32x4_add(src_i32_hi, u32x4_shr(src_i32_hi, 16)); + let dst_i32_hi = u32x4_shr(dst_i32_hi, 16); + + u16x8_narrow_i32x4(dst_i32_lo, dst_i32_hi) +} + +// Divide + +pub(crate) unsafe fn divide_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + divide_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn divide_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + divide_alpha_row_inplace(row); + } +} + +pub(crate) unsafe fn divide_alpha_row(src_row: &[U16x2], dst_row: &mut [U16x2]) { + let src_chunks = src_row.chunks_exact(4); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(4); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + let mut src_pixels = [U16x2::new([0, 0]); 4]; + src_pixels + .iter_mut() + .zip(src_remainder) + .for_each(|(d, s)| *d = *s); + + let mut dst_pixels = [U16x2::new([0, 0]); 4]; + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + dst_pixels + .iter() + .zip(dst_reminder) + .for_each(|(s, d)| *d = *s); + } +} + +pub(crate) unsafe fn divide_alpha_row_inplace(row: &mut [U16x2]) { + let mut chunks = row.chunks_exact_mut(4); + // Using a simple for-loop in this case is faster than implementation with pre-reading + for chunk in &mut chunks { + let mut pixels = v128_load(chunk.as_ptr() as *const v128); + pixels = divide_alpha_4_pixels(pixels); + v128_store(chunk.as_mut_ptr() as *mut v128, pixels); + } + + let reminder = 
chunks.into_remainder(); + if !reminder.is_empty() { + let mut src_pixels = [U16x2::new([0, 0]); 4]; + src_pixels + .iter_mut() + .zip(reminder.iter()) + .for_each(|(d, s)| *d = *s); + + let mut dst_pixels = [U16x2::new([0, 0]); 4]; + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + dst_pixels.iter().zip(reminder).for_each(|(s, d)| *d = *s); + } +} + +#[inline] +unsafe fn divide_alpha_4_pixels(pixels: v128) -> v128 { + const ALPHA_MASK: v128 = u32x4(0xffff0000u32, 0xffff0000u32, 0xffff0000u32, 0xffff0000u32); + const LUMA_MASK: v128 = i32x4(0xffff, 0xffff, 0xffff, 0xffff); + const ALPHA_MAX: v128 = f32x4(65535.0, 65535.0, 65535.0, 65535.0); + const ALPHA_SCALE_MAX: v128 = f32x4(2147483648f32, 2147483648f32, 2147483648f32, 2147483648f32); + /* + |L0 A0 | |L1 A1 | |L2 A2 | |L3 A3 | + |0001 0203| |0405 0607| |0809 1011| |1213 1415| + */ + const ALPHA32_SH: v128 = i8x16(2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1); + + let alpha_f32x4 = f32x4_convert_i32x4(u8x16_swizzle(pixels, ALPHA32_SH)); + let luma_f32x4 = f32x4_convert_i32x4(v128_and(pixels, LUMA_MASK)); + let scaled_luma_f32x4 = f32x4_mul(luma_f32x4, ALPHA_MAX); + let divided_luma_u32x4 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(scaled_luma_f32x4, alpha_f32x4), + ALPHA_SCALE_MAX, + )); + + let alpha = v128_and(pixels, ALPHA_MASK); + u8x16_shuffle::<0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31>( + divided_luma_u32x4, + alpha, + ) +} diff --git a/src/alpha/u16x4/mod.rs b/src/alpha/u16x4/mod.rs index 2ea82f2..9b90d8f 100644 --- a/src/alpha/u16x4/mod.rs +++ b/src/alpha/u16x4/mod.rs @@ -11,6 +11,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl AlphaMulDiv for U16x4 { fn multiply_alpha( @@ -25,6 +27,8 @@ impl AlphaMulDiv for U16x4 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha(src_image, dst_image) }, _ => native::multiply_alpha(src_image, dst_image), } } @@ -37,6 +41,8 @@ impl AlphaMulDiv for U16x4 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha_inplace(image) }, _ => native::multiply_alpha_inplace(image), } } @@ -53,6 +59,8 @@ impl AlphaMulDiv for U16x4 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha(src_image, dst_image) }, _ => native::divide_alpha(src_image, dst_image), } } @@ -65,6 +73,8 @@ impl AlphaMulDiv for U16x4 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha_inplace(image) }, _ => native::divide_alpha_inplace(image), } } diff --git a/src/alpha/u16x4/wasm32.rs b/src/alpha/u16x4/wasm32.rs new file mode 100644 index 
0000000..7b68f44 --- /dev/null +++ b/src/alpha/u16x4/wasm32.rs @@ -0,0 +1,220 @@ +use std::arch::wasm32::*; + +use crate::pixels::U16x4; +use crate::utils::foreach_with_pre_reading; +use crate::{ImageView, ImageViewMut}; + +use super::native; + +pub(crate) unsafe fn multiply_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + multiply_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn multiply_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + multiply_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row(src_row: &[U16x4], dst_row: &mut [U16x4]) { + let src_chunks = src_row.chunks_exact(2); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(2); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiply_alpha_2_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + native::multiply_alpha_row(src_remainder, dst_reminder); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row_inplace(row: &mut [U16x4]) { + let mut chunks = row.chunks_exact_mut(2); + foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiply_alpha_2_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let remainder = chunks.into_remainder(); + if !remainder.is_empty() { + native::multiply_alpha_row_inplace(remainder); + } +} + +#[inline] +unsafe fn multiply_alpha_2_pixels(pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let half = i32x4_splat(0x8000); + const MAX_A: i64 = 0xffff000000000000u64 as i64; + let max_alpha = i64x2_splat(MAX_A); + /* + |R0 G0 B0 A0 | |R1 G1 B1 A1 | + |0001 0203 0405 0607| |0809 1011 1213 1415| + */ + const FACTOR_MASK: v128 = i8x16(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15); + + let factor_pixels = u8x16_swizzle(pixels, FACTOR_MASK); + let factor_pixels = v128_or(factor_pixels, max_alpha); + + let src_i32_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(pixels, zero); + let factors = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(factor_pixels, zero); + let src_i32_lo = i32x4_add(i32x4_mul(src_i32_lo, factors), half); + let dst_i32_lo = i32x4_add(src_i32_lo, u32x4_shr(src_i32_lo, 16)); + let dst_i32_lo = u32x4_shr(dst_i32_lo, 16); + + let src_i32_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(pixels, zero); + let factors = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(factor_pixels, zero); + let src_i32_hi = i32x4_add(i32x4_mul(src_i32_hi, factors), half); + let dst_i32_hi = i32x4_add(src_i32_hi, u32x4_shr(src_i32_hi, 16)); + let dst_i32_hi = u32x4_shr(dst_i32_hi, 16); + + u16x8_narrow_i32x4(dst_i32_lo, dst_i32_hi) +} + +// Divide + +pub(crate) unsafe fn divide_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + divide_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe 
fn divide_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + divide_alpha_row_inplace(row); + } +} + +pub(crate) unsafe fn divide_alpha_row(src_row: &[U16x4], dst_row: &mut [U16x4]) { + let src_chunks = src_row.chunks_exact(2); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(2); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_2_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if let Some(src) = src_remainder.first() { + let src_pixels = [*src, U16x4::new([0, 0, 0, 0])]; + let mut dst_pixels = [U16x4::new([0, 0, 0, 0]); 2]; + + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_2_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + let dst_reminder = dst_chunks.into_remainder(); + if let Some(dst) = dst_reminder.get_mut(0) { + *dst = dst_pixels[0]; + } + } +} + +pub(crate) unsafe fn divide_alpha_row_inplace(row: &mut [U16x4]) { + let mut chunks = row.chunks_exact_mut(2); + foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_2_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let reminder = chunks.into_remainder(); + if let Some(pixel) = reminder.first_mut() { + let src_pixels = [*pixel, U16x4::new([0, 0, 0, 0])]; + let mut dst_pixels = [U16x4::new([0, 0, 0, 0]); 2]; + + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_2_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + *pixel = dst_pixels[0]; + } +} + +#[inline] +unsafe fn divide_alpha_2_pixels(pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let alpha_mask = i64x2_splat(0xffff000000000000u64 as i64); + let alpha_max = f32x4_splat(65535.0); + let alpha_scale_max = f32x4_splat(2147483648f32); + /* + |R0 G0 B0 A0 | |R1 G1 B1 A1 | + |0001 0203 0405 0607| |0809 1011 1213 1415| + */ + const ALPHA32_SH0: v128 = i8x16(6, 7, -1, -1, 6, 7, -1, -1, 6, 7, -1, -1, 6, 7, -1, -1); + const ALPHA32_SH1: v128 = i8x16( + 14, 15, -1, -1, 14, 15, -1, -1, 14, 15, -1, -1, 14, 15, -1, -1, + ); + + let alpha0_f32x4 = f32x4_convert_i32x4(u8x16_swizzle(pixels, ALPHA32_SH0)); + let alpha1_f32x4 = f32x4_convert_i32x4(u8x16_swizzle(pixels, ALPHA32_SH1)); + + let pix0_f32x4 = f32x4_convert_i32x4(i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(pixels, zero)); + let pix1_f32x4 = f32x4_convert_i32x4(i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(pixels, zero)); + + let scaled_pix0_f32x4 = f32x4_mul(pix0_f32x4, alpha_max); + let scaled_pix1_f32x4 = f32x4_mul(pix1_f32x4, alpha_max); + + let divided_pix0_i32x4 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(scaled_pix0_f32x4, alpha0_f32x4), + alpha_scale_max, + )); + let divided_pix1_i32x4 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(scaled_pix1_f32x4, alpha1_f32x4), + alpha_scale_max, + )); + + let two_pixels_i16x8 = u16x8_narrow_i32x4(divided_pix0_i32x4, divided_pix1_i32x4); + let alpha = v128_and(pixels, alpha_mask); + u8x16_shuffle::<0, 1, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 12, 13, 30, 31>(two_pixels_i16x8, alpha) +} diff --git a/src/alpha/u8x2/mod.rs b/src/alpha/u8x2/mod.rs index 
1014d97..11dbee5 100644 --- a/src/alpha/u8x2/mod.rs +++ b/src/alpha/u8x2/mod.rs @@ -11,6 +11,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl AlphaMulDiv for U8x2 { fn multiply_alpha( @@ -25,6 +27,8 @@ impl AlphaMulDiv for U8x2 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha(src_image, dst_image) }, _ => native::multiply_alpha(src_image, dst_image), } } @@ -37,6 +41,8 @@ impl AlphaMulDiv for U8x2 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha_inplace(image) }, _ => native::multiply_alpha_inplace(image), } } @@ -53,6 +59,8 @@ impl AlphaMulDiv for U8x2 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha(src_image, dst_image) }, _ => native::divide_alpha(src_image, dst_image), } } @@ -65,6 +73,8 @@ impl AlphaMulDiv for U8x2 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha_inplace(image) }, _ => native::divide_alpha_inplace(image), } } diff --git a/src/alpha/u8x2/wasm32.rs b/src/alpha/u8x2/wasm32.rs new file mode 100644 index 0000000..0c51690 --- /dev/null +++ b/src/alpha/u8x2/wasm32.rs @@ -0,0 +1,233 @@ +use std::arch::wasm32::*; + +use crate::pixels::U8x2; +use crate::utils::foreach_with_pre_reading; +use crate::{ImageView, ImageViewMut}; + +use super::native; + +pub(crate) unsafe fn multiply_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + multiply_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn multiply_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + multiply_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row(src_row: &[U8x2], dst_row: &mut [U8x2]) { + let src_chunks = src_row.chunks_exact(8); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(8); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiplies_alpha_8_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + native::multiply_alpha_row(src_remainder, dst_reminder); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row_inplace(row: &mut [U8x2]) { + let mut chunks = row.chunks_exact_mut(8); + // Using a simple for-loop in this case is faster than implementation with pre-reading + for chunk in 
&mut chunks { + let src_pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_pixels = multiplies_alpha_8_pixels(src_pixels); + v128_store(chunk.as_mut_ptr() as *mut v128, dst_pixels); + } + + let reminder = chunks.into_remainder(); + if !reminder.is_empty() { + native::multiply_alpha_row_inplace(reminder); + } +} + +#[inline] +unsafe fn multiplies_alpha_8_pixels(pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let half = i16x8_splat(128); + const MAX_A: i16 = 0xff00u16 as i16; + let max_alpha = i16x8_splat(MAX_A); + /* + |L A | |L A | |L A | |L A | |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + */ + const FACTOR_MASK: v128 = i8x16(1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); + + let factor_pixels = i8x16_swizzle(pixels, FACTOR_MASK); + let factor_pixels = v128_or(factor_pixels, max_alpha); + + let src_i16_lo = + i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(pixels, zero); + let factors = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + factor_pixels, + zero, + ); + let src_i16_lo = i16x8_add(i16x8_mul(src_i16_lo, factors), half); + let dst_i16_lo = i16x8_add(src_i16_lo, u16x8_shr(src_i16_lo, 8)); + let dst_i16_lo = u16x8_shr(dst_i16_lo, 8); + + let src_i16_hi = + i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(pixels, zero); + let factors = i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + factor_pixels, + zero, + ); + let src_i16_hi = i16x8_add(i16x8_mul(src_i16_hi, factors), half); + let dst_i16_hi = i16x8_add(src_i16_hi, u16x8_shr(src_i16_hi, 8)); + let dst_i16_hi = u16x8_shr(dst_i16_hi, 8); + + u8x16_narrow_i16x8(dst_i16_lo, dst_i16_hi) +} + +// Divide + +pub(crate) unsafe fn divide_alpha(src_image: &ImageView, dst_image: &mut ImageViewMut) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + divide_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn divide_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + divide_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn divide_alpha_row(src_row: &[U8x2], dst_row: &mut [U8x2]) { + let src_chunks = src_row.chunks_exact(8); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(8); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_8_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + let mut src_pixels = [U8x2::new(0); 8]; + src_pixels + .iter_mut() + .zip(src_remainder) + .for_each(|(d, s)| *d = *s); + + let mut dst_pixels = [U8x2::new(0); 8]; + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_8_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + dst_pixels + .iter() + .zip(dst_reminder) + .for_each(|(s, d)| *d = *s); + } +} + +#[inline] +pub(crate) unsafe fn divide_alpha_row_inplace(row: &mut [U8x2]) { + let mut chunks = row.chunks_exact_mut(8); + foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as 
*mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_8_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let reminder = chunks.into_remainder(); + if !reminder.is_empty() { + let mut src_pixels = [U8x2::new(0); 8]; + src_pixels + .iter_mut() + .zip(reminder.iter()) + .for_each(|(d, s)| *d = *s); + + let mut dst_pixels = [U8x2::new(0); 8]; + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_8_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + dst_pixels.iter().zip(reminder).for_each(|(s, d)| *d = *s); + } +} + +#[inline] +unsafe fn divide_alpha_8_pixels(pixels: v128) -> v128 { + let alpha_mask = i16x8_splat(0xff00u16 as i16); + let luma_mask = i16x8_splat(0xff); + const ALPHA32_SH_LO: v128 = i8x16(1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1); + const ALPHA32_SH_HI: v128 = i8x16( + 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, + ); + let alpha_scale = f32x4_splat(255.0 * 256.0); + // sse4 _mm_cvtps_epi32 converts inf to i32::MIN or 2147483648f32 u32. + // wasm32 u32x4_trunc_sat_f32x4 on AVX systems converts inf to u32::MAX. + // Tests pass without capping inf from dividing by zero, but scaled values will not match sse4, + // and other potential test cases will (probably?) break. + let alpha_scale_max = f32x4_splat(2147483648f32); + + let alpha_lo_f32 = f32x4_convert_u32x4(i8x16_swizzle(pixels, ALPHA32_SH_LO)); + // trunc_sat will always round down. Adding f32x4_nearest would match _mm_cvtps_epi32 exactly, + // but would add extra instructions. + let scaled_alpha_lo_u32 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(alpha_scale, alpha_lo_f32), + alpha_scale_max, + )); + let alpha_hi_f32 = f32x4_convert_u32x4(i8x16_swizzle(pixels, ALPHA32_SH_HI)); + let scaled_alpha_hi_u32 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(alpha_scale, alpha_hi_f32), + alpha_scale_max, + )); + let scaled_alpha_u16 = u16x8_narrow_i32x4(scaled_alpha_lo_u32, scaled_alpha_hi_u32); + + let luma_u16 = v128_and(pixels, luma_mask); + let scaled_luma_u16 = u16x8_mul(luma_u16, scaled_alpha_u16); + let scaled_luma_u16 = u16x8_shr(scaled_luma_u16, 8); + + let alpha = v128_and(pixels, alpha_mask); + u8x16_shuffle::<0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31>( + scaled_luma_u16, + alpha, + ) +} diff --git a/src/alpha/u8x4/mod.rs b/src/alpha/u8x4/mod.rs index 9682f48..1f529a0 100644 --- a/src/alpha/u8x4/mod.rs +++ b/src/alpha/u8x4/mod.rs @@ -11,6 +11,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl AlphaMulDiv for U8x4 { fn multiply_alpha( @@ -25,6 +27,8 @@ impl AlphaMulDiv for U8x4 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha(src_image, dst_image) }, _ => native::multiply_alpha(src_image, dst_image), } } @@ -37,6 +41,8 @@ impl AlphaMulDiv for U8x4 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha_inplace(image) }, _ => native::multiply_alpha_inplace(image), } } @@ -53,6 +59,8 @@ impl AlphaMulDiv for U8x4 { CpuExtensions::Sse4_1 => unsafe { 
sse4::divide_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha(src_image, dst_image) }, _ => native::divide_alpha(src_image, dst_image), } } @@ -65,6 +73,8 @@ impl AlphaMulDiv for U8x4 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha_inplace(image) }, _ => native::divide_alpha_inplace(image), } } diff --git a/src/alpha/u8x4/wasm32.rs b/src/alpha/u8x4/wasm32.rs new file mode 100644 index 0000000..502c4b6 --- /dev/null +++ b/src/alpha/u8x4/wasm32.rs @@ -0,0 +1,220 @@ +use std::arch::wasm32::*; + +use crate::pixels::U8x4; +use crate::utils::foreach_with_pre_reading; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +use super::native; + +pub(crate) unsafe fn multiply_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + multiply_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn multiply_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + multiply_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row(src_row: &[U8x4], dst_row: &mut [U8x4]) { + let src_chunks = src_row.chunks_exact(4); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(4); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiply_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + native::multiply_alpha_row(src_remainder, dst_reminder); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row_inplace(row: &mut [U8x4]) { + let mut chunks = row.chunks_exact_mut(4); + // Using a simple for-loop in this case is faster than implementation with pre-reading + for chunk in &mut chunks { + let mut pixels = v128_load(chunk.as_ptr() as *const v128); + pixels = multiply_alpha_4_pixels(pixels); + v128_store(chunk.as_mut_ptr() as *mut v128, pixels); + } + + let tail = chunks.into_remainder(); + if !tail.is_empty() { + native::multiply_alpha_row_inplace(tail); + } +} + +#[inline] +unsafe fn multiply_alpha_4_pixels(pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let half = i16x8_splat(128); + const MAX_A: u32 = 0xff000000u32; + let max_alpha = u32x4_splat(MAX_A); + + const FACTOR_MASK: v128 = i8x16(3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); + + let factor_pixels = u8x16_swizzle(pixels, FACTOR_MASK); + let factor_pixels = v128_or(factor_pixels, max_alpha); + + let pix1 = + i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(pixels, zero); + let factors = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + factor_pixels, + zero, + ); + let pix1 = i16x8_add(i16x8_mul(pix1, factors), half); + let pix1 = i16x8_add(pix1, u16x8_shr(pix1, 8)); + let pix1 = u16x8_shr(pix1, 8); + + let pix2 = + 
i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(pixels, zero); + let factors = i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + factor_pixels, + zero, + ); + let pix2 = i16x8_add(i16x8_mul(pix2, factors), half); + let pix2 = i16x8_add(pix2, u16x8_shr(pix2, 8)); + let pix2 = u16x8_shr(pix2, 8); + + u8x16_narrow_i16x8(pix1, pix2) +} + +// Divide + +pub(crate) unsafe fn divide_alpha(src_image: &ImageView, dst_image: &mut ImageViewMut) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + for (src_row, dst_row) in src_rows.zip(dst_rows) { + divide_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn divide_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + divide_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn divide_alpha_row(src_row: &[U8x4], dst_row: &mut [U8x4]) { + let src_chunks = src_row.chunks_exact(4); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(4); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + let mut src_buffer = [U8x4::new(0); 4]; + src_buffer + .iter_mut() + .zip(src_remainder) + .for_each(|(d, s)| *d = *s); + + let mut dst_buffer = [U8x4::new(0); 4]; + let src_pixels = v128_load(src_buffer.as_ptr() as *const v128); + let dst_pixels = divide_alpha_4_pixels(src_pixels); + v128_store(dst_buffer.as_mut_ptr() as *mut v128, dst_pixels); + + dst_buffer + .iter() + .zip(dst_reminder) + .for_each(|(s, d)| *d = *s); + } +} + +#[inline] +pub(crate) unsafe fn divide_alpha_row_inplace(row: &mut [U8x4]) { + let mut chunks = row.chunks_exact_mut(4); + foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let tail = chunks.into_remainder(); + if !tail.is_empty() { + let mut src_buffer = [U8x4::new(0); 4]; + src_buffer + .iter_mut() + .zip(tail.iter()) + .for_each(|(d, s)| *d = *s); + + let mut dst_buffer = [U8x4::new(0); 4]; + let src_pixels = v128_load(src_buffer.as_ptr() as *const v128); + let dst_pixels = divide_alpha_4_pixels(src_pixels); + v128_store(dst_buffer.as_mut_ptr() as *mut v128, dst_pixels); + + dst_buffer.iter().zip(tail).for_each(|(s, d)| *d = *s); + } +} + +#[inline] +unsafe fn divide_alpha_4_pixels(src_pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let alpha_mask = i32x4_splat(0xff000000u32 as i32); + const SHUFFLE1: v128 = i8x16(0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, 5, 4, 5); + const SHUFFLE2: v128 = i8x16(8, 9, 8, 9, 8, 9, 8, 9, 12, 13, 12, 13, 12, 13, 12, 13); + let alpha_scale = f32x4_splat(255.0 * 256.0); + let alpha_scale_max = f32x4_splat(2147483648f32); + + let alpha_f32 = f32x4_convert_i32x4(u32x4_shr(src_pixels, 24)); + let scaled_alpha_f32 = f32x4_div(alpha_scale, alpha_f32); + let scaled_alpha_u32 = u32x4_trunc_sat_f32x4(f32x4_pmin(scaled_alpha_f32, alpha_scale_max)); + let mma0 = u8x16_swizzle(scaled_alpha_u32, SHUFFLE1); + let mma1 = 
u8x16_swizzle(scaled_alpha_u32, SHUFFLE2); + + let pix0 = + u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(zero, src_pixels); + let pix1 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + zero, src_pixels, + ); + + let pix0 = wasm32_utils::u16x8_mul_hi(pix0, mma0); + let pix1 = wasm32_utils::u16x8_mul_hi(pix1, mma1); + + let alpha = v128_and(src_pixels, alpha_mask); + let rgb = u8x16_narrow_i16x8(pix0, pix1); + + u8x16_shuffle::<0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31>(rgb, alpha) +} diff --git a/src/convolution/u16x1/mod.rs b/src/convolution/u16x1/mod.rs index 4e9e9bb..3bb33b6 100644 --- a/src/convolution/u16x1/mod.rs +++ b/src/convolution/u16x1/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U16 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U16 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u16x1/wasm32.rs b/src/convolution/u16x1/wasm32.rs new file mode 100644 index 0000000..9e2c7aa --- /dev/null +++ b/src/convolution/u16x1/wasm32.rs @@ -0,0 +1,284 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U16; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_one_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( + src_rows: [&[U16]; 4], + dst_rows: [&mut &mut [U16]; 4], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut ll_buf = [0i64; 2]; + + /* + |L0 | |L1 | |L2 | |L3 | |L4 | |L5 | |L6 | |L7 | + |0001| |0203| |0405| |0607| |0809| |1011| |1213| |1415| + + Shuffle to extract L0 and L1 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L2 and L3 as i64: + 4, 5, -1, -1, -1, 
-1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L4 and L5 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L6 and L7 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const L0L1_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const L2L3_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const L4L5_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const L6L7_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut ll_sum: [v128; 4] = [i64x2_splat(0i64); 4]; + + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_8 = coeffs.chunks_exact(8); + coeffs = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let coeff23_i64x2 = i64x2(k[2] as i64, k[3] as i64); + let coeff45_i64x2 = i64x2(k[4] as i64, k[5] as i64); + let coeff67_i64x2 = i64x2(k[6] as i64, k[7] as i64); + + for i in 0..4 { + let mut sum = ll_sum[i]; + let source = wasm32_utils::load_v128(src_rows[i], x); + + let l0l1_i64x2 = i8x16_swizzle(source, L0L1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l0l1_i64x2, coeff01_i64x2)); + + let l2l3_i64x2 = i8x16_swizzle(source, L2L3_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l2l3_i64x2, coeff23_i64x2)); + + let l4l5_i64x2 = i8x16_swizzle(source, L4L5_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l4l5_i64x2, coeff45_i64x2)); + + let l6l7_i64x2 = i8x16_swizzle(source, L6L7_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l6l7_i64x2, coeff67_i64x2)); + + ll_sum[i] = sum; + } + x += 8; + } + + let coeffs_by_4 = coeffs.chunks_exact(4); + coeffs = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let coeff23_i64x2 = i64x2(k[2] as i64, k[3] as i64); + + for i in 0..4 { + let mut sum = ll_sum[i]; + let source = wasm32_utils::load_v128(src_rows[i], x); + + let l0l1_i64x2 = i8x16_swizzle(source, L0L1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l0l1_i64x2, coeff01_i64x2)); + + let l2l3_i64x2 = i8x16_swizzle(source, L2L3_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l2l3_i64x2, coeff23_i64x2)); + + ll_sum[i] = sum; + } + x += 4; + } + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + for i in 0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + let l_i64x2 = i8x16_swizzle(source, L0L1_SHUFFLE); + ll_sum[i] = i64x2_add( + ll_sum[i], + wasm32_utils::i64x2_mul_lo(l_i64x2, coeff01_i64x2), + ); + } + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff01_i64x2 = i64x2(k as i64, 0); + for i in 0..4 { + let pixel = (*src_rows[i].get_unchecked(x)).0 as i64; + let source = i64x2(pixel, 0); + ll_sum[i] = i64x2_add(ll_sum[i], wasm32_utils::i64x2_mul_lo(source, coeff01_i64x2)); + } + } + + for i in 0..4 { + v128_store((&mut ll_buf).as_mut_ptr() as *mut v128, ll_sum[i]); + let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); + dst_pixel.0 = normalizer.clip(ll_buf.iter().sum::() + half_error); + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - 
coefficients_chunks.len() == dst_row.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len() +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_one_row( + src_row: &[U16], + dst_row: &mut [U16], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut ll_buf = [0i64; 2]; + + /* + |L0 | |L1 | |L2 | |L3 | |L4 | |L5 | |L6 | |L7 | + |0001| |0203| |0405| |0607| |0809| |1011| |1213| |1415| + + Shuffle to extract L0 and L1 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L2 and L3 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L4 and L5 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L6 and L7 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const L01_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const L23_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const L45_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const L67_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut ll_sum = i64x2_splat(0); + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_8 = coeffs.chunks_exact(8); + coeffs = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let coeff23_i64x2 = i64x2(k[2] as i64, k[3] as i64); + let coeff45_i64x2 = i64x2(k[4] as i64, k[5] as i64); + let coeff67_i64x2 = i64x2(k[6] as i64, k[7] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let l_i64x2 = i8x16_swizzle(source, L01_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff01_i64x2)); + + let l_i64x2 = i8x16_swizzle(source, L23_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff23_i64x2)); + + let l_i64x2 = i8x16_swizzle(source, L45_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff45_i64x2)); + + let l_i64x2 = i8x16_swizzle(source, L67_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff67_i64x2)); + + x += 8; + } + + let coeffs_by_4 = coeffs.chunks_exact(4); + coeffs = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let coeff23_i64x2 = i64x2(k[2] as i64, k[3] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let l_i64x2 = i8x16_swizzle(source, L01_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff01_i64x2)); + + let l_i64x2 = i8x16_swizzle(source, L23_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff23_i64x2)); + + x += 4; + } + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let source = wasm32_utils::load_v128(src_row, x); + + let l_i64x2 = i8x16_swizzle(source, L01_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff01_i64x2)); + + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff01_i64x2 = i64x2(k 
as i64, 0); + let pixel = (*src_row.get_unchecked(x)).0 as i64; + let source = i64x2(pixel, 0); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(source, coeff01_i64x2)); + } + + v128_store((&mut ll_buf).as_mut_ptr() as *mut v128, ll_sum); + let dst_pixel = dst_row.get_unchecked_mut(dst_x); + dst_pixel.0 = normalizer.clip(ll_buf[0] + ll_buf[1] + half_error); + } +} diff --git a/src/convolution/u16x2/mod.rs b/src/convolution/u16x2/mod.rs index 8d830f6..7e4e8c3 100644 --- a/src/convolution/u16x2/mod.rs +++ b/src/convolution/u16x2/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U16x2 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U16x2 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u16x2/wasm32.rs b/src/convolution/u16x2/wasm32.rs new file mode 100644 index 0000000..769784a --- /dev/null +++ b/src/convolution/u16x2/wasm32.rs @@ -0,0 +1,259 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U16x2; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_one_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( + src_rows: [&[U16x2]; 4], + dst_rows: [&mut &mut [U16x2]; 4], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut ll_buf = [0i64; 2]; + + /* + |L0 A0 | |L1 A1 | |L2 A2 | |L3 A3 | + |0001 0203| |0405 0607| |0809 1011| |1213 1415| + + Shuffle to extract L0 and A0 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L1 and A1 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L2 and A2 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle 
to extract L3 and A3 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const P0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const P1_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const P2_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const P3_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut ll_sum = [i64x2_splat(half_error); 4]; + + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_4 = coeffs.chunks_exact(4); + coeffs = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + let coeff2_i64x2 = i64x2_splat(k[2] as i64); + let coeff3_i64x2 = i64x2_splat(k[3] as i64); + + for i in 0..4 { + let mut sum = ll_sum[i]; + let source = wasm32_utils::load_v128(src_rows[i], x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff1_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P2_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff2_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P3_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff3_i64x2)); + + ll_sum[i] = sum; + } + x += 4; + } + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + + for i in 0..4 { + let mut sum = ll_sum[i]; + let source = wasm32_utils::loadl_i64(src_rows[i], x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff1_i64x2)); + + ll_sum[i] = sum; + } + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff0_i64x2 = i64x2_splat(k as i64); + for i in 0..4 { + let source = wasm32_utils::loadl_i32(src_rows[i], x); + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + ll_sum[i] = i64x2_add(ll_sum[i], wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + } + } + + for i in 0..4 { + v128_store((&mut ll_buf).as_mut_ptr() as *mut v128, ll_sum[i]); + let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); + dst_pixel.0 = [normalizer.clip(ll_buf[0]), normalizer.clip(ll_buf[1])]; + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_one_row( + src_row: &[U16x2], + dst_row: &mut [U16x2], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut ll_buf = [0i64; 2]; + + /* + |L0 A0 | |L1 A1 | |L2 A2 | |L3 A3 | + |0001 0203| |0405 0607| |0809 1011| |1213 1415| + + Shuffle to extract L0 and A0 
as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L1 and A1 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L2 and A2 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L3 and A3 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const P0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const P1_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const P2_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const P3_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut ll_sum = i64x2_splat(half_error); + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_4 = coeffs.chunks_exact(4); + coeffs = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + let coeff2_i64x2 = i64x2_splat(k[2] as i64); + let coeff3_i64x2 = i64x2_splat(k[3] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P1_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff1_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P2_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff2_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P3_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff3_i64x2)); + + x += 4; + } + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + + let source = wasm32_utils::loadl_i64(src_row, x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P1_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff1_i64x2)); + + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff0_i64x2 = i64x2_splat(k as i64); + let source = wasm32_utils::loadl_i32(src_row, x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + } + + v128_store((&mut ll_buf).as_mut_ptr() as *mut v128, ll_sum); + let dst_pixel = dst_row.get_unchecked_mut(dst_x); + dst_pixel.0 = [normalizer.clip(ll_buf[0]), normalizer.clip(ll_buf[1])]; + } +} diff --git a/src/convolution/u16x3/mod.rs b/src/convolution/u16x3/mod.rs index 8319e00..bffbd8e 100644 --- a/src/convolution/u16x3/mod.rs +++ b/src/convolution/u16x3/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U16x3 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U16x3 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + 
CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u16x3/wasm32.rs b/src/convolution/u16x3/wasm32.rs new file mode 100644 index 0000000..c16b7de --- /dev/null +++ b/src/convolution/u16x3/wasm32.rs @@ -0,0 +1,236 @@ +use std::arch::wasm32::*; + +use crate::convolution::optimisations::CoefficientsI32Chunk; +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U16x3; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_8u4x(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_8u( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u4x( + src_rows: [&[U16x3]; 4], + dst_rows: [&mut &mut [U16x3]; 4], + coefficients_chunks: &[CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + const ZERO: v128 = i64x2(0, 0); + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut rg_buf = [0i64; 2]; + let mut bb_buf = [0i64; 2]; + + /* + |R G B | |R G B | |R G | + |0001 0203 0405| |0607 0809 1011| |1213 1415| + + Shuffle to extract RG components of first pixel as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract RG components of second pixel as i64: + 6, 7, -1, -1, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B components of two pixels as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + */ + + const RG0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const RG1_SHUFFLE: v128 = i8x16(6, 7, -1, -1, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1); + const BB_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + + let width = src_rows[0].len(); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut rg_sum = [ZERO; 4]; + let mut bb_sum = [ZERO; 4]; + + let mut coeffs = coeffs_chunk.values; + let end_x = x + coeffs.len(); + + if width - end_x >= 1 { + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + let coeff_i64x2 = i64x2(k[0] as i64, k[1] as i64); + + for i in 
0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + + let rg0_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum[i] = i64x2_add( + rg_sum[i], + wasm32_utils::i64x2_mul_lo(rg0_i64x2, coeff0_i64x2), + ); + + let rg1_i64x2 = i8x16_swizzle(source, RG1_SHUFFLE); + rg_sum[i] = i64x2_add( + rg_sum[i], + wasm32_utils::i64x2_mul_lo(rg1_i64x2, coeff1_i64x2), + ); + + let bb_i64x2 = i8x16_swizzle(source, BB_SHUFFLE); + bb_sum[i] = + i64x2_add(bb_sum[i], wasm32_utils::i64x2_mul_lo(bb_i64x2, coeff_i64x2)); + } + x += 2; + } + } + + for &k in coeffs { + let coeff_i64x2 = i64x2_splat(k as i64); + + for i in 0..4 { + let &pixel = src_rows[i].get_unchecked(x); + let rg_i64x2 = i64x2(pixel.0[0] as i64, pixel.0[1] as i64); + rg_sum[i] = i64x2_add(rg_sum[i], wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff_i64x2)); + let bb_i64x2 = i64x2(pixel.0[2] as i64, 0); + bb_sum[i] = i64x2_add(bb_sum[i], wasm32_utils::i64x2_mul_lo(bb_i64x2, coeff_i64x2)); + } + x += 1; + } + + for i in 0..4 { + v128_store((&mut rg_buf).as_mut_ptr() as *mut v128, rg_sum[i]); + v128_store((&mut bb_buf).as_mut_ptr() as *mut v128, bb_sum[i]); + let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); + dst_pixel.0[0] = normalizer.clip(rg_buf[0] + half_error); + dst_pixel.0[1] = normalizer.clip(rg_buf[1] + half_error); + dst_pixel.0[2] = normalizer.clip(bb_buf[0] + bb_buf[1] + half_error); + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coefficients_chunks.len() == dst_row.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u( + src_row: &[U16x3], + dst_row: &mut [U16x3], + coefficients_chunks: &[CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let rg_initial = i64x2_splat(1 << (precision - 1)); + let bb_initial = i64x2_splat(1 << (precision - 2)); + + /* + |R G B | |R G B | |R G | + |0001 0203 0405| |0607 0809 1011| |1213 1415| + + Shuffle to extract RG components of first pixel as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract RG components of second pixel as i64: + 6, 7, -1, -1, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B components of two pixels as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + */ + + const RG0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const RG1_SHUFFLE: v128 = i8x16(6, 7, -1, -1, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1); + const BB_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + let mut rg_buf = [0i64; 2]; + let mut bb_buf = [0i64; 2]; + + let width = src_row.len(); + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + + let mut rg_sum = rg_initial; + let mut bb_sum = bb_initial; + + let mut coeffs = coeffs_chunk.values; + let end_x = x + coeffs.len(); + + if width - end_x >= 1 { + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + let coeff_i64x2 = i64x2(k[0] as i64, k[1] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let rg0_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum = i64x2_add(rg_sum, 
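+                // wasm32_utils::i64x2_mul_lo is taken here to be a lane-wise 64-bit multiply
+                // that keeps the low 64 bits of each product; with 16-bit components and i32
+                // coefficients the products stay far below 2^63, so nothing is lost.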
wasm32_utils::i64x2_mul_lo(rg0_i64x2, coeff0_i64x2)); + + let rg1_i64x2 = i8x16_swizzle(source, RG1_SHUFFLE); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg1_i64x2, coeff1_i64x2)); + + let bb_i64x2 = i8x16_swizzle(source, BB_SHUFFLE); + bb_sum = i64x2_add(bb_sum, wasm32_utils::i64x2_mul_lo(bb_i64x2, coeff_i64x2)); + x += 2; + } + } + + for &k in coeffs { + let coeff_i64x2 = i64x2_splat(k as i64); + + let &pixel = src_row.get_unchecked(x); + let rg_i64x2 = i64x2(pixel.0[0] as i64, pixel.0[1] as i64); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff_i64x2)); + let bb_i64x2 = i64x2(pixel.0[2] as i64, 0); + bb_sum = i64x2_add(bb_sum, wasm32_utils::i64x2_mul_lo(bb_i64x2, coeff_i64x2)); + + x += 1; + } + + v128_store((&mut rg_buf).as_mut_ptr() as *mut v128, rg_sum); + v128_store((&mut bb_buf).as_mut_ptr() as *mut v128, bb_sum); + let dst_pixel = dst_row.get_unchecked_mut(dst_x); + dst_pixel.0[0] = normalizer.clip(rg_buf[0]); + dst_pixel.0[1] = normalizer.clip(rg_buf[1]); + dst_pixel.0[2] = normalizer.clip(bb_buf[0] + bb_buf[1]); + } +} diff --git a/src/convolution/u16x4/mod.rs b/src/convolution/u16x4/mod.rs index a0d8f4c..d9df251 100644 --- a/src/convolution/u16x4/mod.rs +++ b/src/convolution/u16x4/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U16x4 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U16x4 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u16x4/wasm32.rs b/src/convolution/u16x4/wasm32.rs new file mode 100644 index 0000000..670efb2 --- /dev/null +++ b/src/convolution/u16x4/wasm32.rs @@ -0,0 +1,236 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U16x4; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_one_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( 
+ src_rows: [&[U16x4]; 4], + dst_rows: [&mut &mut [U16x4]; 4], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut rg_buf = [0i64; 2]; + let mut ba_buf = [0i64; 2]; + + /* + |R0 G0 B0 A0 | |R1 G1 B1 A1 | + |0001 0203 0405 0607| |0809 1011 1213 1415| + + Shuffle to extract R0 and G0 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract R1 and G1 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B0 and A0 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B1 and A1 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const RG0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const RG1_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const BA0_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const BA1_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut rg_sum = [i64x2_splat(half_error); 4]; + let mut ba_sum = [i64x2_splat(half_error); 4]; + + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + + for i in 0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + let mut sum = rg_sum[i]; + let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2)); + let rg_i64x2 = i8x16_swizzle(source, RG1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff1_i64x2)); + rg_sum[i] = sum; + + let mut sum = ba_sum[i]; + let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2)); + let ba_i64x2 = i8x16_swizzle(source, BA1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff1_i64x2)); + ba_sum[i] = sum; + } + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff0_i64x2 = i64x2_splat(k as i64); + for i in 0..4 { + let source = wasm32_utils::loadl_i64(src_rows[i], x); + let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum[i] = i64x2_add( + rg_sum[i], + wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2), + ); + let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE); + ba_sum[i] = i64x2_add( + ba_sum[i], + wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2), + ); + } + } + + for i in 0..4 { + v128_store((&mut rg_buf).as_mut_ptr() as *mut v128, rg_sum[i]); + v128_store((&mut ba_buf).as_mut_ptr() as *mut v128, ba_sum[i]); + let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); + dst_pixel.0 = [ + normalizer.clip(rg_buf[0]), + normalizer.clip(rg_buf[1]), + normalizer.clip(ba_buf[0]), + normalizer.clip(ba_buf[1]), + ]; + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn 
horiz_convolution_one_row( + src_row: &[U16x4], + dst_row: &mut [U16x4], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut rg_buf = [0i64; 2]; + let mut ba_buf = [0i64; 2]; + + /* + |R0 G0 B0 A0 | |R1 G1 B1 A1 | + |0001 0203 0405 0607| |0809 1011 1213 1415| + + Shuffle to extract R0 and G0 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract R1 and G1 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B0 and A0 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B1 and A1 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const RG0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const RG1_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const BA0_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const BA1_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut coeffs = coeffs_chunk.values; + let mut rg_sum = i64x2_splat(half_error); + let mut ba_sum = i64x2_splat(half_error); + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2)); + let rg_i64x2 = i8x16_swizzle(source, RG1_SHUFFLE); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff1_i64x2)); + + let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE); + ba_sum = i64x2_add(ba_sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2)); + let ba_i64x2 = i8x16_swizzle(source, BA1_SHUFFLE); + ba_sum = i64x2_add(ba_sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff1_i64x2)); + + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff0_i64x2 = i64x2_splat(k as i64); + let source = wasm32_utils::loadl_i64(src_row, x); + let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2)); + let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE); + ba_sum = i64x2_add(ba_sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2)); + } + + v128_store((&mut rg_buf).as_mut_ptr() as *mut v128, rg_sum); + v128_store((&mut ba_buf).as_mut_ptr() as *mut v128, ba_sum); + let dst_pixel = dst_row.get_unchecked_mut(dst_x); + dst_pixel.0 = [ + normalizer.clip(rg_buf[0]), + normalizer.clip(rg_buf[1]), + normalizer.clip(ba_buf[0]), + normalizer.clip(ba_buf[1]), + ]; + } +} diff --git a/src/convolution/u8x1/mod.rs b/src/convolution/u8x1/mod.rs index 45141cb..5697e49 100644 --- a/src/convolution/u8x1/mod.rs +++ b/src/convolution/u8x1/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U8 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U8 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] 
CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u8x1/wasm32.rs b/src/convolution/u8x1/wasm32.rs new file mode 100644 index 0000000..517345f --- /dev/null +++ b/src/convolution/u8x1/wasm32.rs @@ -0,0 +1,163 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U8; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( + src_rows: [&[U8]; 4], + dst_rows: [&mut &mut [U8]; 4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + normalizer: &optimisations::Normalizer16, +) { + const ZERO: v128 = i64x2(0, 0); + let initial = 1 << (normalizer.precision() - 1); + let mut buf = [0, 0, 0, 0, initial]; + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let coeffs = coeffs_chunk.values; + let mut x = coeffs_chunk.start as usize; + let mut result_i32x4 = [ZERO, ZERO, ZERO, ZERO]; + + let coeffs_by_8 = coeffs.chunks_exact(8); + let reminder8 = coeffs_by_8.remainder(); + for k in coeffs_by_8 { + let coeffs_i16x8 = v128_load(k.as_ptr() as *const v128); + for i in 0..4 { + let pixels_u8x8 = wasm32_utils::loadl_i64(src_rows[i], x); + let pixels_i16x8 = u16x8_extend_low_u8x16(pixels_u8x8); + result_i32x4[i] = + i32x4_add(result_i32x4[i], i32x4_dot_i16x8(pixels_i16x8, coeffs_i16x8)); + } + x += 8; + } + + let mut coeffs_by_4 = reminder8.chunks_exact(4); + let reminder4 = coeffs_by_4.remainder(); + if let Some(k) = coeffs_by_4.next() { + let coeffs_i16x4 = wasm32_utils::loadl_i64(k, 0); + for i in 0..4 { + let pixels_u8x4 = wasm32_utils::loadl_i32(src_rows[i], x); + let pixels_i16x4 = u16x8_extend_low_u8x16(pixels_u8x4); + result_i32x4[i] = + i32x4_add(result_i32x4[i], i32x4_dot_i16x8(pixels_i16x4, coeffs_i16x4)); + } + x += 4; + } + + let mut result_i32x4 = result_i32x4.map(|v| { + v128_store(buf.as_mut_ptr() as *mut v128, v); + buf.iter().sum() + }); + + for &coeff in reminder4 { + let coeff_i32 = coeff as i32; + for i in 0..4 { + result_i32x4[i] += 
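+                // Scalar tail: fewer than four coefficients are left, and the SIMD lane sums
+                // were already collapsed above via `buf` (whose fifth element carries the
+                // rounding constant `initial`), so plain i32 arithmetic finishes the pixel.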
src_rows[i].get_unchecked(x).0.to_owned() as i32 * coeff_i32; + } + x += 1; + } + + let result_u8x4 = result_i32x4.map(|v| normalizer.clip(v)); + for i in 0..4 { + dst_rows[i].get_unchecked_mut(dst_x).0 = result_u8x4[i]; + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_row( + src_row: &[U8], + dst_row: &mut [U8], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + normalizer: &optimisations::Normalizer16, +) { + const ZERO: v128 = i64x2(0, 0); + let initial = 1 << (normalizer.precision() - 1); + let mut buf = [0, 0, 0, 0, initial]; + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let coeffs = coeffs_chunk.values; + let mut x = coeffs_chunk.start as usize; + let mut result_i32x4 = ZERO; + + let coeffs_by_8 = coeffs.chunks_exact(8); + let reminder8 = coeffs_by_8.remainder(); + for k in coeffs_by_8 { + let coeffs_i16x8 = v128_load(k.as_ptr() as *const v128); + let pixels_u8x8 = wasm32_utils::loadl_i64(src_row, x); + let pixels_i16x8 = u16x8_extend_low_u8x16(pixels_u8x8); + result_i32x4 = i32x4_add(result_i32x4, i32x4_dot_i16x8(pixels_i16x8, coeffs_i16x8)); + x += 8; + } + + let mut coeffs_by_4 = reminder8.chunks_exact(4); + let reminder4 = coeffs_by_4.remainder(); + if let Some(k) = coeffs_by_4.next() { + let coeffs_i16x4 = wasm32_utils::loadl_i64(k, 0); + let pixels_u8x4 = wasm32_utils::loadl_i32(src_row, x); + let pixels_i16x4 = u16x8_extend_low_u8x16(pixels_u8x4); + result_i32x4 = i32x4_add(result_i32x4, i32x4_dot_i16x8(pixels_i16x4, coeffs_i16x4)); + x += 4; + } + + v128_store(buf.as_mut_ptr() as *mut v128, result_i32x4); + let mut result_i32 = buf.iter().sum(); + + for &coeff in reminder4 { + let coeff_i32 = coeff as i32; + result_i32 += src_row.get_unchecked(x).0 as i32 * coeff_i32; + x += 1; + } + + dst_row.get_unchecked_mut(dst_x).0 = normalizer.clip(result_i32); + } +} diff --git a/src/convolution/u8x2/mod.rs b/src/convolution/u8x2/mod.rs index 29c99fc..2197b97 100644 --- a/src/convolution/u8x2/mod.rs +++ b/src/convolution/u8x2/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U8x2 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U8x2 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u8x2/wasm32.rs b/src/convolution/u8x2/wasm32.rs new file mode 100644 index 0000000..fec5f0b --- /dev/null +++ b/src/convolution/u8x2/wasm32.rs @@ -0,0 +1,320 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U8x2; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let 
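+    // normalized_chunks() presumably yields, per destination pixel, the starting source x
+    // plus the i16 fixed-point weights; precision() and clip() used below undo that scaling.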
coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_one_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( + src_rows: [&[U8x2]; 4], + dst_rows: [&mut &mut [U8x2]; 4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + normalizer: &optimisations::Normalizer16, +) { + let precision = normalizer.precision(); + let initial = i32x4_splat(1 << (precision - 2)); + + /* + |L A | |L A | |L A | |L A | |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Shuffle components with converting from u8 into i16: + + A: |-1 07| |-1 05| |-1 03| |-1 01| + L: |-1 06| |-1 04| |-1 02| |-1 00| + */ + #[rustfmt::skip] + const SH1: v128 = i8x16( + 0, -1, 2, -1, 4, -1, 6, -1, 1, -1, 3, -1, 5, -1, 7, -1 + ); + /* + A: |-1 15| |-1 13| |-1 11| |-1 09| + L: |-1 14| |-1 12| |-1 10| |-1 08| + */ + #[rustfmt::skip] + const SH2: v128 = i8x16( + 8, -1, 10, -1, 12, -1, 14, -1, 9, -1, 11, -1, 13, -1, 15, -1 + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x = coeffs_chunk.start as usize; + + let mut sss: [v128; 4] = [initial; 4]; + let coeffs = coeffs_chunk.values; + + let coeffs_by_8 = coeffs.chunks_exact(8); + let reminder = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let mmk0 = wasm32_utils::ptr_i16_to_set1_i64(k, 0); + let mmk1 = wasm32_utils::ptr_i16_to_set1_i64(k, 4); + + for i in 0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + let pix = i8x16_swizzle(source, SH1); + let tmp_sum = i32x4_add(sss[i], i32x4_dot_i16x8(pix, mmk0)); + let pix = i8x16_swizzle(source, SH2); + sss[i] = i32x4_add(tmp_sum, i32x4_dot_i16x8(pix, mmk1)); + } + x += 8; + } + + let coeffs_by_4 = reminder.chunks_exact(4); + let reminder = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let mmk = wasm32_utils::ptr_i16_to_set1_i64(k, 0); + + for i in 0..4 { + let source = wasm32_utils::loadl_i64(src_rows[i], x); + let pix = i8x16_swizzle(source, SH1); + sss[i] = i32x4_add(sss[i], i32x4_dot_i16x8(pix, mmk)); + } + x += 4; + } + + let coeffs_by_2 = reminder.chunks_exact(2); + let reminder = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + + for i in 0..4 { + let source = wasm32_utils::loadl_i32(src_rows[i], x); + let pix = i8x16_swizzle(source, SH1); + sss[i] = i32x4_add(sss[i], i32x4_dot_i16x8(pix, mmk)); + } + x += 2; + } + + if let Some(&k) = reminder.first() { + let mmk = i32x4_splat(k as i32); + + for i in 0..4 { + let source = wasm32_utils::loadl_i16(src_rows[i], x); + let pix = i8x16_swizzle(source, 
SH1); + sss[i] = i32x4_add(sss[i], i32x4_dot_i16x8(pix, mmk)); + } + } + + for i in 0..4 { + set_dst_pixel(sss[i], dst_rows[i], dst_x, normalizer); + } + } +} + +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn set_dst_pixel( + raw: v128, + d_row: &mut &mut [U8x2], + dst_x: usize, + normalizer: &optimisations::Normalizer16, +) { + let l32x2 = i64x2_extract_lane::<0>(raw); + let a32x2 = i64x2_extract_lane::<1>(raw); + let l32 = ((l32x2 >> 32) as i32).saturating_add((l32x2 & 0xffffffff) as i32); + let a32 = ((a32x2 >> 32) as i32).saturating_add((a32x2 & 0xffffffff) as i32); + let l8 = normalizer.clip(l32); + let a8 = normalizer.clip(a32); + d_row.get_unchecked_mut(dst_x).0 = u16::from_le_bytes([l8, a8]); +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_one_row( + src_row: &[U8x2], + dst_row: &mut [U8x2], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + normalizer: &optimisations::Normalizer16, +) { + let precision = normalizer.precision(); + /* + |L A | |L A | |L A | |L A | |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Scale first four pixels into i16: + + A: |-1 07| |-1 05| + L: |-1 06| |-1 04| + A: |-1 03| |-1 01| + L: |-1 02| |-1 00| + */ + #[rustfmt::skip] + const PIX_SH1: v128 = i8x16( + 0, -1, 2, -1, 1, -1, 3, -1, 4, -1, 6, -1, 5, -1, 7, -1 + ); + /* + |C0 | |C1 | |C2 | |C3 | |C4 | |C5 | |C6 | |C7 | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Duplicate first four coefficients for A and L components of pixels: + + CA: |07 06| |05 04| + CL: |07 06| |05 04| + CA: |03 02| |01 00| + CL: |03 02| |01 00| + */ + #[rustfmt::skip] + const COEFF_SH1: v128 = i8x16( + 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7 + ); + + /* + |L A | |L A | |L A | |L A | |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Scale second four pixels into i16: + + A: |-1 15| |-1 13| + L: |-1 14| |-1 12| + A: |-1 11| |-1 09| + L: |-1 10| |-1 08| + */ + #[rustfmt::skip] + const PIX_SH2: v128 = i8x16( + 8, -1, 10, -1, 9, -1, 11, -1, 12, -1, 14, -1, 13, -1, 15, -1 + ); + /* + |C0 | |C1 | |C2 | |C3 | |C4 | |C5 | |C6 | |C7 | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Duplicate second four coefficients for A and L components of pixels: + + CA: |15 14| |13 12| + CL: |15 14| |13 12| + CA: |11 10| |09 08| + CL: |11 10| |09 08| + */ + #[rustfmt::skip] + const COEFF_SH2: v128 = i8x16( + 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15 + ); + + /* + |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Scale four pixels into i16: + + A: |-1 07| |-1 05| + L: |-1 06| |-1 04| + A: |-1 03| |-1 01| + L: |-1 02| |-1 00| + */ + const PIX_SH3: v128 = i8x16(0, -1, 2, -1, 1, -1, 3, -1, 4, -1, 6, -1, 5, -1, 7, -1); + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x = coeffs_chunk.start as usize; + let mut coeffs = coeffs_chunk.values; + + // Lower part will be added to higher, use only half of the error + let mut sss = i32x4_splat(1 << (precision - 2)); + + let coeffs_by_8 = coeffs.chunks_exact(8); + coeffs = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let ksource = 
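+            // All eight i16 coefficients are loaded at once; COEFF_SH1/COEFF_SH2 then repeat
+            // each coefficient pair (k[i], k[i+1]) so one i32x4_dot_i16x8 sums two pixels'
+            // L components in one lane and their A components in the next.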
wasm32_utils::load_v128(k, 0); + let source = wasm32_utils::load_v128(src_row, x); + + let pix = i8x16_swizzle(source, PIX_SH1); + let mmk = i8x16_swizzle(ksource, COEFF_SH1); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + let pix = i8x16_swizzle(source, PIX_SH2); + let mmk = i8x16_swizzle(ksource, COEFF_SH2); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 8; + } + + let coeffs_by_4 = coeffs.chunks_exact(4); + let reminder1 = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let mmk = i16x8(k[0], k[1], k[0], k[1], k[2], k[3], k[2], k[3]); + let source = wasm32_utils::loadl_i64(src_row, x); + let pix = i8x16_swizzle(source, PIX_SH3); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 4 + } + + if !reminder1.is_empty() { + let mut pixels: [i16; 6] = [0; 6]; + let mut coeffs: [i16; 3] = [0; 3]; + for (i, &coeff) in reminder1.iter().enumerate() { + coeffs[i] = coeff; + let pixel: [u8; 2] = (*src_row.get_unchecked(x)).0.to_le_bytes(); + pixels[i * 2] = pixel[0] as i16; + pixels[i * 2 + 1] = pixel[1] as i16; + x += 1; + } + + let pix = i16x8( + pixels[0], pixels[2], pixels[1], pixels[3], pixels[4], 0, pixels[5], 0, + ); + let mmk = i16x8( + coeffs[0], coeffs[1], coeffs[0], coeffs[1], coeffs[2], 0, coeffs[2], 0, + ); + + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + } + + let lo = i64x2_extract_lane::<0>(sss); + let hi = i64x2_extract_lane::<1>(sss); + + let a32 = ((lo >> 32) as i32).saturating_add((hi >> 32) as i32); + let l32 = ((lo & 0xffffffff) as i32).saturating_add((hi & 0xffffffff) as i32); + let a8 = normalizer.clip(a32); + let l8 = normalizer.clip(l32); + dst_row.get_unchecked_mut(dst_x).0 = u16::from_le_bytes([l8, a8]); + } +} diff --git a/src/convolution/u8x3/mod.rs b/src/convolution/u8x3/mod.rs index 01160de..249b6e3 100644 --- a/src/convolution/u8x3/mod.rs +++ b/src/convolution/u8x3/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U8x3 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U8x3 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u8x3/wasm32.rs b/src/convolution/u8x3/wasm32.rs new file mode 100644 index 0000000..9d0e08d --- /dev/null +++ b/src/convolution/u8x3/wasm32.rs @@ -0,0 +1,292 @@ +use std::arch::wasm32::*; +use std::intrinsics::transmute; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U8x3; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let precision = normalizer.precision(); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_8u4x(src_rows, dst_rows, &coefficients_chunks, precision); + } + } + + let mut yy = dst_height 
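+    // dst_height rounded down to a multiple of four: those rows were produced by the
+    // four-rows-at-a-time loop above; the remaining 0..=3 rows go through the one-row path.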
- dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_8u( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + precision, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u4x( + src_rows: [&[U8x3]; 4], + dst_rows: [&mut &mut [U8x3]; 4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + precision: u8, +) { + const ZERO: v128 = i64x2(0, 0); + let initial = i32x4_splat(1 << (precision - 1)); + let src_width = src_rows[0].len(); + + /* + |R G B | |R G B | |R G B | |R G B | |R G B | |R | + |00 01 02| |03 04 05| |06 07 08| |09 10 11| |12 13 14| |15| + + Ignore 12-15 bytes in register and + shuffle other components with converting from u8 into i16: + + x: |-1 -1| |-1 -1| + B: |-1 05| |-1 02| + G: |-1 04| |-1 01| + R: |-1 03| |-1 00| + */ + #[rustfmt::skip] + const SH_LO: v128 = i8x16( + 0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1 + ); + /* + x: |-1 -1| |-1 -1| + B: |-1 11| |-1 08| + G: |-1 10| |-1 07| + R: |-1 09| |-1 06| + */ + #[rustfmt::skip] + const SH_HI: v128 = i8x16( + 6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1 + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let x_start = coeffs_chunk.start as usize; + let mut x = x_start; + + let mut sss_a = [initial; 4]; + let mut coeffs = coeffs_chunk.values; + + // Next block of code will be load source pixels by 16 bytes per time. + // We must guarantee what this process will not go beyond + // the one row of image. + // (16 bytes) / (3 bytes per pixel) = 5 whole pixels + 1 byte + let max_x = src_width.saturating_sub(5); + if x < max_x { + let coeffs_by_4 = coeffs.chunks_exact(4); + + for k in coeffs_by_4 { + let mmk0 = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + let mmk1 = wasm32_utils::ptr_i16_to_set1_i32(k, 2); + for i in 0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + let pix = i8x16_swizzle(source, SH_LO); + let mut sss = sss_a[i]; + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk0)); + let pix = i8x16_swizzle(source, SH_HI); + sss_a[i] = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk1)); + } + + x += 4; + if x >= max_x { + break; + } + } + } + + // Next block of code will be load source pixels by 8 bytes per time. + // We must guarantee what this process will not go beyond + // the one row of image. + // (8 bytes) / (3 bytes per pixel) = 2 whole pixels + 2 bytes + let max_x = src_width.saturating_sub(2); + if x < max_x { + let coeffs_by_2 = coeffs[x - x_start..].chunks_exact(2); + + for k in coeffs_by_2 { + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + + for i in 0..4 { + let source = wasm32_utils::loadl_i64(src_rows[i], x); + let pix = i8x16_swizzle(source, SH_LO); + sss_a[i] = i32x4_add(sss_a[i], i32x4_dot_i16x8(pix, mmk)); + } + + x += 2; + if x >= max_x { + break; + } + } + } + + coeffs = coeffs.split_at(x - x_start).1; + for &k in coeffs { + let mmk = i32x4_splat(k as i32); + for i in 0..4 { + let pix = wasm32_utils::i32x4_extend_low_ptr_u8x3(src_rows[i], x); + sss_a[i] = i32x4_add(sss_a[i], i32x4_dot_i16x8(pix, mmk)); + } + + x += 1; + } + macro_rules! 
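+        // `call!` only wraps the right shift by `precision`; constify_imm8! (a helper macro
+        // defined elsewhere in the crate) is presumably kept to mirror the sse4 path, where
+        // the shift amount has to be a compile-time immediate.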
call { + ($imm8:expr) => {{ + sss_a[0] = i32x4_shr(sss_a[0], $imm8); + sss_a[1] = i32x4_shr(sss_a[1], $imm8); + sss_a[2] = i32x4_shr(sss_a[2], $imm8); + sss_a[3] = i32x4_shr(sss_a[3], $imm8); + }}; + } + constify_imm8!(precision, call); + + for i in 0..4 { + let sss = i16x8_narrow_i32x4(sss_a[i], ZERO); + let pixel: u32 = transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss, ZERO))); + let bytes = pixel.to_le_bytes(); + dst_rows[i].get_unchecked_mut(dst_x).0 = [bytes[0], bytes[1], bytes[2]]; + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u( + src_row: &[U8x3], + dst_row: &mut [U8x3], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + precision: u8, +) { + #[rustfmt::skip] + const PIX_SH1: v128 = i8x16( + 0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1 + ); + #[rustfmt::skip] + const COEF_SH1: v128 = i8x16( + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 + ); + #[rustfmt::skip] + const PIX_SH2: v128 = i8x16( + 6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1 + ); + #[rustfmt::skip] + const COEF_SH2: v128 = i8x16( + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 + ); + /* + Load 8 bytes from memory into low half of 16-bytes register: + |R G B | |R G B | |R G | + |00 01 02| |03 04 05| |06 07| 08 09 10 11 12 13 14 15 + + Ignore 06-16 bytes in 16-bytes register and + shuffle other components with converting from u8 into i16: + + x: |-1 -1| |-1 -1| + B: |-1 05| |-1 02| + G: |-1 04| |-1 01| + R: |-1 03| |-1 00| + */ + let src_width = src_row.len(); + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let x_start = coeffs_chunk.start as usize; + let mut x = x_start; + let mut coeffs = coeffs_chunk.values; + let mut sss = i32x4_splat(1 << (precision - 1)); + + // Next block of code will be load source pixels by 16 bytes per time. + // We must guarantee what this process will not go beyond + // the one row of image. + // (16 bytes) / (3 bytes per pixel) = 5 whole pixels + 1 bytes + let max_x = src_width.saturating_sub(5); + if x < max_x { + let coeffs_by_4 = coeffs.chunks_exact(4); + for k in coeffs_by_4 { + let ksource = wasm32_utils::loadl_i64(k, 0); + let source = wasm32_utils::load_v128(src_row, x); + + let pix = i8x16_swizzle(source, PIX_SH1); + let mmk = i8x16_swizzle(ksource, COEF_SH1); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + let pix = i8x16_swizzle(source, PIX_SH2); + let mmk = i8x16_swizzle(ksource, COEF_SH2); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 4; + if x >= max_x { + break; + } + } + } + + // Next block of code will be load source pixels by 8 bytes per time. + // We must guarantee what this process will not go beyond + // the one row of image. 
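+        // For example, with src_width = 10 the guard below gives max_x = 8, so the last
+        // 8-byte load starts at byte 3 * 7 = 21 and reads bytes 21..29 of the 30-byte row.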
+ // (8 bytes) / (3 bytes per pixel) = 2 whole pixels + 2 bytes + let max_x = src_width.saturating_sub(2); + if x < max_x { + let coeffs_by_2 = coeffs[x - x_start..].chunks_exact(2); + + for k in coeffs_by_2 { + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + let source = wasm32_utils::loadl_i64(src_row, x); + let pix = i8x16_swizzle(source, PIX_SH1); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 2; + if x >= max_x { + break; + } + } + } + + coeffs = coeffs.split_at(x - x_start).1; + for &k in coeffs { + let pix = wasm32_utils::i32x4_extend_low_ptr_u8x3(src_row, x); + let mmk = i32x4_splat(k as i32); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + x += 1; + } + + macro_rules! call { + ($imm8:expr) => {{ + sss = i32x4_shr(sss, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss = i16x8_narrow_i32x4(sss, sss); + let pixel: u32 = transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss, sss))); + let bytes = pixel.to_le_bytes(); + dst_row.get_unchecked_mut(dst_x).0 = [bytes[0], bytes[1], bytes[2]]; + } +} diff --git a/src/convolution/u8x4/mod.rs b/src/convolution/u8x4/mod.rs index ae37998..69a9dc3 100644 --- a/src/convolution/u8x4/mod.rs +++ b/src/convolution/u8x4/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U8x4 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U8x4 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u8x4/wasm32.rs b/src/convolution/u8x4/wasm32.rs new file mode 100644 index 0000000..4651b8b --- /dev/null +++ b/src/convolution/u8x4/wasm32.rs @@ -0,0 +1,282 @@ +use std::arch::wasm32::*; +use std::intrinsics::transmute; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U8x4; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +// This code is based on C-implementation from Pillow-SIMD package for Python +// https://github.com/uploadcare/pillow-simd + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let precision = normalizer.precision(); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_8u4x(src_rows, dst_rows, &coefficients_chunks, precision); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_8u( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + precision, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in 
coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u4x( + src_rows: [&[U8x4]; 4], + dst_rows: [&mut &mut [U8x4]; 4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + precision: u8, +) { + let initial = i32x4_splat(1 << (precision - 1)); + const MASK_LO: v128 = i8x16(0, -1, 4, -1, 1, -1, 5, -1, 2, -1, 6, -1, 3, -1, 7, -1); + const MASK_HI: v128 = i8x16(8, -1, 12, -1, 9, -1, 13, -1, 10, -1, 14, -1, 11, -1, 15, -1); + const MASK: v128 = i8x16(0, -1, 4, -1, 1, -1, 5, -1, 2, -1, 6, -1, 3, -1, 7, -1); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + + let mut sss0 = initial; + let mut sss1 = initial; + let mut sss2 = initial; + let mut sss3 = initial; + + let coeffs = coeffs_chunk.values; + let coeffs_by_4 = coeffs.chunks_exact(4); + let reminder1 = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let mmk_lo = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + let mmk_hi = wasm32_utils::ptr_i16_to_set1_i32(k, 2); + + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + let mut source = wasm32_utils::load_v128(src_rows[0], x); + // [16] a1 a0 b1 b0 g1 g0 r1 r0 + let mut pix = i8x16_swizzle(source, MASK_LO); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk_lo)); + // [16] a3 a2 b3 b2 g3 g2 r3 r2 + pix = i8x16_swizzle(source, MASK_HI); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk_hi)); + + source = wasm32_utils::load_v128(src_rows[1], x); + pix = i8x16_swizzle(source, MASK_LO); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk_lo)); + pix = i8x16_swizzle(source, MASK_HI); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk_hi)); + + source = wasm32_utils::load_v128(src_rows[2], x); + pix = i8x16_swizzle(source, MASK_LO); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk_lo)); + pix = i8x16_swizzle(source, MASK_HI); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk_hi)); + + source = wasm32_utils::load_v128(src_rows[3], x); + pix = i8x16_swizzle(source, MASK_LO); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk_lo)); + pix = i8x16_swizzle(source, MASK_HI); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk_hi)); + x += 4; + } + + let coeffs_by_2 = reminder1.chunks_exact(2); + let reminder2 = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + // [16] k1 k0 k1 k0 k1 k0 k1 k0 + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + + // [8] x x x x x x x x a1 b1 g1 r1 a0 b0 g0 r0 + let mut pix = wasm32_utils::loadl_i64(src_rows[0], x); + // [16] a1 a0 b1 b0 g1 g0 r1 r0 + pix = i8x16_swizzle(pix, MASK); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::loadl_i64(src_rows[1], x); + pix = i8x16_swizzle(pix, MASK); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::loadl_i64(src_rows[2], x); + pix = i8x16_swizzle(pix, MASK); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::loadl_i64(src_rows[3], x); + pix = i8x16_swizzle(pix, MASK); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk)); + + x += 2; + } + + if let Some(&k) = reminder2.first() { + // [16] xx k0 xx k0 xx k0 xx k0 + let mmk = i32x4_splat(k as i32); + // [16] xx a0 xx b0 xx g0 xx r0 + let mut pix = wasm32_utils::i32x4_extend_low_ptr_u8x4(src_rows[0], x); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::i32x4_extend_low_ptr_u8x4(src_rows[1], x); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + pix = 
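+            // i32x4_extend_low_ptr_u8x4 is assumed to load one U8x4 pixel and zero-extend its
+            // four bytes into i32 lanes, matching the "[16] xx a0 xx b0 xx g0 xx r0" layout
+            // noted above, so the same dot-product path works for a lone coefficient.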
wasm32_utils::i32x4_extend_low_ptr_u8x4(src_rows[2], x); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::i32x4_extend_low_ptr_u8x4(src_rows[3], x); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss0 = i32x4_shr(sss0, $imm8); + sss1 = i32x4_shr(sss1, $imm8); + sss2 = i32x4_shr(sss2, $imm8); + sss3 = i32x4_shr(sss3, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss0 = i16x8_narrow_i32x4(sss0, sss0); + sss1 = i16x8_narrow_i32x4(sss1, sss1); + sss2 = i16x8_narrow_i32x4(sss2, sss2); + sss3 = i16x8_narrow_i32x4(sss3, sss3); + *dst_rows[0].get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss0, sss0))); + *dst_rows[1].get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss1, sss1))); + *dst_rows[2].get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss2, sss2))); + *dst_rows[3].get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss3, sss3))); + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coefficients_chunks.len() == dst_row.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u( + src_row: &[U8x4], + dst_row: &mut [U8x4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + precision: u8, +) { + let initial = i32x4_splat(1 << (precision - 1)); + const SH1: v128 = i8x16(0, -1, 8, -1, 1, -1, 9, -1, 2, -1, 10, -1, 3, -1, 11, -1); + const SH2: v128 = i8x16(0, 1, 4, 5, 0, 1, 4, 5, 0, 1, 4, 5, 0, 1, 4, 5); + const SH3: v128 = i8x16(4, -1, 12, -1, 5, -1, 13, -1, 6, -1, 14, -1, 7, -1, 15, -1); + const SH4: v128 = i8x16(2, 3, 6, 7, 2, 3, 6, 7, 2, 3, 6, 7, 2, 3, 6, 7); + const SH5: v128 = i8x16(8, 9, 12, 13, 8, 9, 12, 13, 8, 9, 12, 13, 8, 9, 12, 13); + const SH6: v128 = i8x16( + 10, 11, 14, 15, 10, 11, 14, 15, 10, 11, 14, 15, 10, 11, 14, 15, + ); + const SH7: v128 = i8x16(0, -1, 4, -1, 1, -1, 5, -1, 2, -1, 6, -1, 3, -1, 7, -1); + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut sss = initial; + + let coeffs_by_8 = coeffs_chunk.values.chunks_exact(8); + let reminder8 = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let ksource = wasm32_utils::load_v128(k, 0); + + let mut source = wasm32_utils::load_v128(src_row, x); + + let mut pix = i8x16_swizzle(source, SH1); + let mut mmk = i8x16_swizzle(ksource, SH2); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + pix = i8x16_swizzle(source, SH3); + mmk = i8x16_swizzle(ksource, SH4); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + source = wasm32_utils::load_v128(src_row, x + 4); + + pix = i8x16_swizzle(source, SH1); + mmk = i8x16_swizzle(ksource, SH5); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + pix = i8x16_swizzle(source, SH3); + mmk = i8x16_swizzle(ksource, SH6); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 8; + } + + let coeffs_by_4 = reminder8.chunks_exact(4); + let reminder4 = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let source = wasm32_utils::load_v128(src_row, x); + let ksource = wasm32_utils::loadl_i64(k, 0); + + let mut pix = i8x16_swizzle(source, SH1); + let mut mmk = i8x16_swizzle(ksource, SH2); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + pix = 
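+            // SH1/SH2 paired pixels (x, x+2) with coefficients k[0], k[2] above; SH3/SH4 do
+            // the same for pixels (x+1, x+3) with k[1], k[3], so two dot products cover the
+            // whole four-pixel block per channel.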
i8x16_swizzle(source, SH3); + mmk = i8x16_swizzle(ksource, SH4); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 4; + } + + let coeffs_by_2 = reminder4.chunks_exact(2); + let reminder2 = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + let source = wasm32_utils::loadl_i64(src_row, x); + let pix = i8x16_swizzle(source, SH7); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 2 + } + + if let Some(&k) = reminder2.first() { + let pix = wasm32_utils::i32x4_extend_low_ptr_u8x4(src_row, x); + let mmk = i32x4_splat(k as i32); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss = i32x4_shr(sss, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss = i16x8_narrow_i32x4(sss, sss); + *dst_row.get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss, sss))); + } +} diff --git a/src/convolution/vertical_u16/mod.rs b/src/convolution/vertical_u16/mod.rs index ec581d6..7064a1c 100644 --- a/src/convolution/vertical_u16/mod.rs +++ b/src/convolution/vertical_u16/mod.rs @@ -10,6 +10,8 @@ pub(crate) mod native; mod neon; #[cfg(target_arch = "x86_64")] pub(crate) mod sse4; +#[cfg(target_arch = "wasm32")] +pub(crate) mod wasm32; pub(crate) fn vert_convolution_u16>( src_image: &ImageView, @@ -29,6 +31,8 @@ pub(crate) fn vert_convolution_u16>( CpuExtensions::Sse4_1 => sse4::vert_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::vert_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => wasm32::vert_convolution(src_image, dst_image, offset, coeffs), _ => native::vert_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/vertical_u16/wasm32.rs b/src/convolution/vertical_u16/wasm32.rs new file mode 100644 index 0000000..4f3c457 --- /dev/null +++ b/src/convolution/vertical_u16/wasm32.rs @@ -0,0 +1,238 @@ +use std::arch::wasm32::*; + +use crate::convolution::optimisations::CoefficientsI32Chunk; +use crate::convolution::vertical_u16::native::convolution_by_u16; +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::PixelExt; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +pub(crate) fn vert_convolution>( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let src_x = offset as usize * T::count_of_components(); + + let dst_rows = dst_image.iter_rows_mut(); + for (dst_row, coeffs_chunk) in dst_rows.zip(coefficients_chunks) { + unsafe { + vert_convolution_into_one_row_u16(src_image, dst_row, src_x, coeffs_chunk, &normalizer); + } + } +} + +#[target_feature(enable = "simd128")] +unsafe fn vert_convolution_into_one_row_u16>( + src_img: &ImageView, + dst_row: &mut [T], + mut src_x: usize, + coeffs_chunk: CoefficientsI32Chunk, + normalizer: &optimisations::Normalizer32, +) { + let y_start = coeffs_chunk.start; + let coeffs = coeffs_chunk.values; + let max_y = y_start + coeffs.len() as u32; + let mut dst_u16 = T::components_mut(dst_row); + + /* + |0 1 2 3 4 5 6 7 | + |0001 0203 0405 0607 0809 1011 1213 1415| + + Shuffle to extract 0-1 components as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract 2-3 components as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, 
-1, -1, -1 + + Shuffle to extract 4-5 components as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract 6-7 components as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + + */ + + let c_shuffles = [ + i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1), + i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1), + i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1), + i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ), + ]; + + let precision = normalizer.precision(); + let initial = i64x2_splat(1 << (precision - 1)); + let mut c_buf = [0i64; 2]; + + let mut dst_chunks_16 = dst_u16.chunks_exact_mut(16); + for dst_chunk in &mut dst_chunks_16 { + let mut sums = [[initial; 2], [initial; 2], [initial; 2], [initial; 2]]; + + let mut y: u32 = 0; + let coeffs_2 = coeffs.chunks_exact(2); + let coeffs_reminder = coeffs_2.remainder(); + + for (src_rows, two_coeffs) in src_img.iter_2_rows(y_start, max_y).zip(coeffs_2) { + let src_rows = src_rows.map(|row| T::components(row)); + + for r in 0..2 { + let coeff_i64x2 = i64x2_splat(two_coeffs[r] as i64); + for x in 0..2 { + let source = wasm32_utils::load_v128(src_rows[r], src_x + x * 8); + for i in 0..4 { + let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]); + sums[i][x] = + i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + } + } + } + y += 2; + } + + if let Some(&k) = coeffs_reminder.first() { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let coeff_i64x2 = i64x2_splat(k as i64); + + for x in 0..2 { + let source = wasm32_utils::load_v128(components, src_x + x * 8); + for i in 0..4 { + let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]); + sums[i][x] = + i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + } + } + } + + let mut dst_ptr = dst_chunk.as_mut_ptr(); + for x in 0..2 { + for sum in sums { + v128_store((&mut c_buf).as_mut_ptr() as *mut v128, sum[x]); + *dst_ptr = normalizer.clip(c_buf[0]); + dst_ptr = dst_ptr.add(1); + *dst_ptr = normalizer.clip(c_buf[1]); + dst_ptr = dst_ptr.add(1); + } + } + + src_x += 16; + } + + dst_u16 = dst_chunks_16.into_remainder(); + let mut dst_chunks_8 = dst_u16.chunks_exact_mut(8); + if let Some(dst_chunk) = dst_chunks_8.next() { + let mut sums = [initial, initial, initial, initial]; + + let mut y: u32 = 0; + let coeffs_2 = coeffs.chunks_exact(2); + let coeffs_reminder = coeffs_2.remainder(); + + for (src_rows, two_coeffs) in src_img.iter_2_rows(y_start, max_y).zip(coeffs_2) { + let src_rows = src_rows.map(|row| T::components(row)); + let coeffs_i64 = [ + i64x2_splat(two_coeffs[0] as i64), + i64x2_splat(two_coeffs[1] as i64), + ]; + + for r in 0..2 { + let source = wasm32_utils::load_v128(src_rows[r], src_x); + for i in 0..4 { + let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]); + sums[i] = + i64x2_add(sums[i], wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r])); + } + } + y += 2; + } + + if let Some(&k) = coeffs_reminder.first() { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let coeff_i64x2 = i64x2_splat(k as i64); + let source = wasm32_utils::load_v128(components, src_x); + for i in 0..4 { + let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]); + sums[i] = i64x2_add(sums[i], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + } + } + + let mut dst_ptr = dst_chunk.as_mut_ptr(); + for sum in sums { + // let mask = _mm_cmpgt_epi64(sums[i], zero); + // 
sums[i] = _mm_and_si128(sums[i] , mask); + // sums[i] = _mm_srl_epi64(sums[i] , precision_i64); + // _mm_packus_epi32(sums[i] , sums[i] ); + v128_store((&mut c_buf).as_mut_ptr() as *mut v128, sum); + *dst_ptr = normalizer.clip(c_buf[0]); + dst_ptr = dst_ptr.add(1); + *dst_ptr = normalizer.clip(c_buf[1]); + dst_ptr = dst_ptr.add(1); + } + + src_x += 8; + } + + dst_u16 = dst_chunks_8.into_remainder(); + let mut dst_chunks_4 = dst_u16.chunks_exact_mut(4); + if let Some(dst_chunk) = dst_chunks_4.next() { + let mut c01 = initial; + let mut c23 = initial; + let mut y: u32 = 0; + let coeffs_2 = coeffs.chunks_exact(2); + let coeffs_reminder = coeffs_2.remainder(); + + for (src_rows, two_coeffs) in src_img.iter_2_rows(y_start, max_y).zip(coeffs_2) { + let src_rows = src_rows.map(|row| T::components(row)); + let coeffs_i64 = [ + i64x2_splat(two_coeffs[0] as i64), + i64x2_splat(two_coeffs[1] as i64), + ]; + for r in 0..2 { + let comp_x4 = src_rows[r].get_unchecked(src_x..src_x + 4); + let c_i64x2 = i64x2(comp_x4[0] as i64, comp_x4[1] as i64); + c01 = i64x2_add(c01, wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r])); + let c_i64x2 = i64x2(comp_x4[2] as i64, comp_x4[3] as i64); + c23 = i64x2_add(c23, wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r])); + } + y += 2; + } + + if let Some(&k) = coeffs_reminder.first() { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let coeff_i64x2 = i64x2_splat(k as i64); + + let comp_x4 = components.get_unchecked(src_x..src_x + 4); + let c_i64x2 = i64x2(comp_x4[0] as i64, comp_x4[1] as i64); + c01 = i64x2_add(c01, wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + let c_i64x2 = i64x2(comp_x4[2] as i64, comp_x4[3] as i64); + c23 = i64x2_add(c23, wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + } + + let mut dst_ptr = dst_chunk.as_mut_ptr(); + v128_store((&mut c_buf).as_mut_ptr() as *mut v128, c01); + *dst_ptr = normalizer.clip(c_buf[0]); + dst_ptr = dst_ptr.add(1); + *dst_ptr = normalizer.clip(c_buf[1]); + dst_ptr = dst_ptr.add(1); + v128_store((&mut c_buf).as_mut_ptr() as *mut v128, c23); + *dst_ptr = normalizer.clip(c_buf[0]); + dst_ptr = dst_ptr.add(1); + *dst_ptr = normalizer.clip(c_buf[1]); + + src_x += 4; + } + + dst_u16 = dst_chunks_4.into_remainder(); + if !dst_u16.is_empty() { + let initial = 1 << (precision - 1); + convolution_by_u16( + src_img, normalizer, initial, dst_u16, src_x, y_start, coeffs, + ); + } +} diff --git a/src/convolution/vertical_u8/mod.rs b/src/convolution/vertical_u8/mod.rs index 5432fae..3d2c79e 100644 --- a/src/convolution/vertical_u8/mod.rs +++ b/src/convolution/vertical_u8/mod.rs @@ -10,6 +10,8 @@ pub(crate) mod native; mod neon; #[cfg(target_arch = "x86_64")] pub(crate) mod sse4; +#[cfg(target_arch = "wasm32")] +pub(crate) mod wasm32; pub(crate) fn vert_convolution_u8>( src_image: &ImageView, @@ -29,6 +31,8 @@ pub(crate) fn vert_convolution_u8>( CpuExtensions::Sse4_1 => sse4::vert_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::vert_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => wasm32::vert_convolution(src_image, dst_image, offset, coeffs), _ => native::vert_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/vertical_u8/wasm32.rs b/src/convolution/vertical_u8/wasm32.rs new file mode 100644 index 0000000..bb2c698 --- /dev/null +++ b/src/convolution/vertical_u8/wasm32.rs @@ -0,0 +1,291 @@ +use std::arch::wasm32::*; + +use 
crate::convolution::vertical_u8::native; +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::PixelExt; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn vert_convolution>( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let src_x = offset as usize * T::count_of_components(); + + let dst_rows = dst_image.iter_rows_mut(); + for (dst_row, coeffs_chunk) in dst_rows.zip(coefficients_chunks) { + unsafe { + vert_convolution_into_one_row_u8(src_image, dst_row, src_x, coeffs_chunk, &normalizer); + } + } +} + +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn vert_convolution_into_one_row_u8>( + src_img: &ImageView, + dst_row: &mut [T], + mut src_x: usize, + coeffs_chunk: optimisations::CoefficientsI16Chunk, + normalizer: &optimisations::Normalizer16, +) { + const ZERO: v128 = i64x2(0, 0); + let y_start = coeffs_chunk.start; + let coeffs = coeffs_chunk.values; + let max_y = y_start + coeffs.len() as u32; + let precision = normalizer.precision(); + let mut dst_u8 = T::components_mut(dst_row); + + let initial = i32x4_splat(1 << (precision - 1)); + + let mut dst_chunks_32 = dst_u8.chunks_exact_mut(32); + for dst_chunk in &mut dst_chunks_32 { + let mut sss0 = initial; + let mut sss1 = initial; + let mut sss2 = initial; + let mut sss3 = initial; + let mut sss4 = initial; + let mut sss5 = initial; + let mut sss6 = initial; + let mut sss7 = initial; + + let mut y: u32 = 0; + + for src_rows in src_img.iter_2_rows(y_start, max_y) { + let components1 = T::components(src_rows[0]); + let components2 = T::components(src_rows[1]); + + // Load two coefficients at once + let mmk = wasm32_utils::ptr_i16_to_set1_i32(coeffs, y as usize); + + let source1 = wasm32_utils::load_v128(components1, src_x); // top line + let source2 = wasm32_utils::load_v128(components2, src_x); // bottom line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + let source = + i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk)); + + let source1 = wasm32_utils::load_v128(components1, src_x + 16); // top line + let source2 = wasm32_utils::load_v128(components2, src_x + 16); // bottom line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss4 = i32x4_add(sss4, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss5 = i32x4_add(sss5, i32x4_dot_i16x8(pix, mmk)); + + let source = + i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss6 = i32x4_add(sss6, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss7 = i32x4_add(sss7, i32x4_dot_i16x8(pix, mmk)); + + y += 2; + } + + if let Some(&k) = coeffs.get(y as usize) { + let s_row = 
src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let mmk = i32x4_splat(k as i32); + + let source1 = wasm32_utils::load_v128(components, src_x); // top line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, ZERO, + ); + let pix = i16x8_extend_low_u8x16(source); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + let source = i16x8_extend_high_u8x16(source1); + let pix = i16x8_extend_low_u8x16(source); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk)); + + let source1 = wasm32_utils::load_v128(components, src_x + 16); // top line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, ZERO, + ); + let pix = i16x8_extend_low_u8x16(source); + sss4 = i32x4_add(sss4, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss5 = i32x4_add(sss5, i32x4_dot_i16x8(pix, mmk)); + + let source = i16x8_extend_high_u8x16(source1); + let pix = i16x8_extend_low_u8x16(source); + sss6 = i32x4_add(sss6, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss7 = i32x4_add(sss7, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss0 = i32x4_shr(sss0, $imm8); + sss1 = i32x4_shr(sss1, $imm8); + sss2 = i32x4_shr(sss2, $imm8); + sss3 = i32x4_shr(sss3, $imm8); + sss4 = i32x4_shr(sss4, $imm8); + sss5 = i32x4_shr(sss5, $imm8); + sss6 = i32x4_shr(sss6, $imm8); + sss7 = i32x4_shr(sss7, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss0 = i16x8_narrow_i32x4(sss0, sss1); + sss2 = i16x8_narrow_i32x4(sss2, sss3); + sss0 = u8x16_narrow_i16x8(sss0, sss2); + let dst_ptr = dst_chunk.as_mut_ptr() as *mut v128; + v128_store(dst_ptr, sss0); + sss4 = i16x8_narrow_i32x4(sss4, sss5); + sss6 = i16x8_narrow_i32x4(sss6, sss7); + sss4 = u8x16_narrow_i16x8(sss4, sss6); + let dst_ptr = dst_ptr.add(1); + v128_store(dst_ptr, sss4); + + src_x += 32; + } + + dst_u8 = dst_chunks_32.into_remainder(); + let mut dst_chunks_8 = dst_u8.chunks_exact_mut(8); + for dst_chunk in &mut dst_chunks_8 { + let mut sss0 = initial; // left row + let mut sss1 = initial; // right row + let mut y: u32 = 0; + + for src_rows in src_img.iter_2_rows(y_start, max_y) { + let components1 = T::components(src_rows[0]); + let components2 = T::components(src_rows[1]); + // Load two coefficients at once + let mmk = wasm32_utils::ptr_i16_to_set1_i32(coeffs, y as usize); + + let source1 = wasm32_utils::loadl_i64(components1, src_x); // top line + let source2 = wasm32_utils::loadl_i64(components2, src_x); // bottom line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + y += 2; + } + + if let Some(&k) = coeffs.get(y as usize) { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let mmk = i32x4_splat(k as i32); + + let source1 = wasm32_utils::loadl_i64(components, src_x); // top line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, ZERO, + ); + let pix = i16x8_extend_low_u8x16(source); + sss0 = i32x4_add(sss0, 
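// Editor's note (hedged, not part of the patch): a summary of the accumulation step at this
// point in vert_convolution_into_one_row_u8. `source` holds the top-row and bottom-row bytes
// interleaved by the i8x16_shuffle above and widened to i16 in `pix`, while `mmk` holds one
// coefficient pair packed into every i32 lane (wasm32_utils::ptr_i16_to_set1_i32). Each i32
// lane of i32x4_dot_i16x8(pix, mmk) is therefore top * coeffs[y] + bottom * coeffs[y + 1],
// i.e. one output component accumulated over two source rows per iteration. In fixed point
// the whole pass computes, per component,
//     out = saturate_u8(((1 << (precision - 1)) + sum_y(src[y] * coeff[y])) >> precision)
// which is what `initial`, the constify_imm8!/i32x4_shr block and the narrowing stores
// implement. The vertical u16 pass earlier in this diff follows the same scheme, but with
// i64 accumulators and i8x16_swizzle shuffles to spread 16-bit components into 64-bit lanes.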
i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss0 = i32x4_shr(sss0, $imm8); + sss1 = i32x4_shr(sss1, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss0 = i16x8_narrow_i32x4(sss0, sss1); + sss0 = u8x16_narrow_i16x8(sss0, sss0); + let dst_ptr = dst_chunk.as_mut_ptr() as *mut [i64; 2]; + (*dst_ptr)[0] = i64x2_extract_lane::<0>(sss0); + + src_x += 8; + } + + dst_u8 = dst_chunks_8.into_remainder(); + let mut dst_chunks_4 = dst_u8.chunks_exact_mut(4); + if let Some(dst_chunk) = dst_chunks_4.next() { + let mut sss = initial; + let mut y: u32 = 0; + + for src_rows in src_img.iter_2_rows(y_start, max_y) { + let components1 = T::components(src_rows[0]); + let components2 = T::components(src_rows[1]); + // Load two coefficients at once + let mmk = wasm32_utils::ptr_i16_to_set1_i32(coeffs, y as usize); + + let source1 = wasm32_utils::i32x4_v128_from_u8(components1, src_x); // top line + let source2 = wasm32_utils::i32x4_v128_from_u8(components2, src_x); // bottom line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + y += 2; + } + + if let Some(&k) = coeffs.get(y as usize) { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let pix = wasm32_utils::i32x4_extend_low_ptr_u8(components, src_x); + let mmk = i32x4_splat(k as i32); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss = i32x4_shr(sss, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss = i16x8_narrow_i32x4(sss, sss); + let dst_ptr = dst_chunk.as_mut_ptr() as *mut i32; + *dst_ptr = i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss, sss)); + + src_x += 4; + } + + dst_u8 = dst_chunks_4.into_remainder(); + if !dst_u8.is_empty() { + native::convolution_by_u8( + src_img, + normalizer, + 1 << (precision - 1), + dst_u8, + src_x, + y_start, + coeffs, + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index 4563ab8..e0e3278 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,3 +34,5 @@ mod resizer; mod simd_utils; #[cfg(feature = "for_test")] pub mod testing; +#[cfg(target_arch = "wasm32")] +mod wasm32_utils; diff --git a/src/resizer.rs b/src/resizer.rs index 549d735..068e3b2 100644 --- a/src/resizer.rs +++ b/src/resizer.rs @@ -16,6 +16,8 @@ pub enum CpuExtensions { Avx2, #[cfg(target_arch = "aarch64")] Neon, + #[cfg(target_arch = "wasm32")] + Wasm32, } impl CpuExtensions { @@ -28,6 +30,8 @@ impl CpuExtensions { Self::Sse4_1 => is_x86_feature_detected!("sse4.1"), #[cfg(target_arch = "aarch64")] Self::Neon => true, + #[cfg(target_arch = "wasm32")] + Self::Wasm32 => true, Self::None => true, } } @@ -54,8 +58,16 @@ impl Default for CpuExtensions { Self::None } } + #[cfg(target_arch = "wasm32")] + fn default() -> Self { + Self::Wasm32 + } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "wasm32" + )))] fn default() -> Self { Self::None } diff --git a/src/wasm32_utils.rs b/src/wasm32_utils.rs new file mode 100644 index 0000000..afb5e2e --- /dev/null +++ b/src/wasm32_utils.rs @@ -0,0 +1,73 @@ +use crate::pixels::{U8x3, U8x4}; +use std::arch::wasm32::*; +use std::intrinsics::transmute; + +#[inline(always)] +pub unsafe fn load_v128(buf: &[T], index: usize) -> 
v128 { + v128_load(buf.get_unchecked(index..).as_ptr() as *const v128) +} + +#[inline(always)] +pub unsafe fn loadl_i64(buf: &[T], index: usize) -> v128 { + let i = buf.get_unchecked(index..).as_ptr() as *const i64; + i64x2(*i, 0) +} + +#[inline(always)] +pub unsafe fn loadl_i32(buf: &[T], index: usize) -> v128 { + let i = buf.get_unchecked(index..).as_ptr() as *const i32; + i32x4(*i, 0, 0, 0) +} + +#[inline(always)] +pub unsafe fn loadl_i16(buf: &[T], index: usize) -> v128 { + let i = buf.get_unchecked(index..).as_ptr() as *const i16; + i16x8(*i, 0, 0, 0, 0, 0, 0, 0) +} + +#[inline(always)] +pub unsafe fn ptr_i16_to_set1_i64(buf: &[i16], index: usize) -> v128 { + i64x2_splat(*(buf.get_unchecked(index..).as_ptr() as *const i64)) +} + +#[inline(always)] +pub unsafe fn ptr_i16_to_set1_i32(buf: &[i16], index: usize) -> v128 { + i32x4_splat(*(buf.get_unchecked(index..).as_ptr() as *const i32)) +} + +#[inline(always)] +pub unsafe fn i32x4_extend_low_ptr_u8(buf: &[u8], index: usize) -> v128 { + let ptr = buf.get_unchecked(index..).as_ptr() as *const v128; + u32x4_extend_low_u16x8(i16x8_extend_low_u8x16(v128_load(ptr))) +} + +#[inline(always)] +pub unsafe fn i32x4_extend_low_ptr_u8x4(buf: &[U8x4], index: usize) -> v128 { + let v: u32 = transmute(buf.get_unchecked(index).0); + u32x4_extend_low_u16x8(i16x8_extend_low_u8x16(u32x4(v, 0, 0, 0))) +} + +#[inline(always)] +pub unsafe fn i32x4_extend_low_ptr_u8x3(buf: &[U8x3], index: usize) -> v128 { + let pixel = buf.get_unchecked(index).0; + i32x4(pixel[0] as i32, pixel[1] as i32, pixel[2] as i32, 0) +} + +#[inline(always)] +pub unsafe fn i32x4_v128_from_u8(buf: &[u8], index: usize) -> v128 { + let ptr = buf.get_unchecked(index..).as_ptr() as *const i32; + i32x4(*ptr, 0, 0, 0) +} + +#[inline(always)] +pub unsafe fn u16x8_mul_hi(a: v128, b: v128) -> v128 { + let lo = u32x4_extmul_low_u16x8(a, b); + let hi = u32x4_extmul_high_u16x8(a, b); + i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(lo, hi) +} + +#[inline(always)] +pub unsafe fn i64x2_mul_lo(a: v128, b: v128) -> v128 { + const SHUFFLE: v128 = i8x16(0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1); + i64x2_extmul_low_i32x4(i8x16_swizzle(a, SHUFFLE), i8x16_swizzle(b, SHUFFLE)) +} diff --git a/testing/src/lib.rs b/testing/src/lib.rs index b70976b..7ad204e 100644 --- a/testing/src/lib.rs +++ b/testing/src/lib.rs @@ -272,5 +272,7 @@ pub fn cpu_ext_into_str(cpu_extensions: CpuExtensions) -> &'static str { CpuExtensions::Avx2 => "avx2", #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => "neon", + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => "wasm32", } } diff --git a/tests/alpha_tests.rs b/tests/alpha_tests.rs index 1464df9..81348ea 100644 --- a/tests/alpha_tests.rs +++ b/tests/alpha_tests.rs @@ -154,6 +154,12 @@ mod multiply_alpha_u8x4 { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -206,6 +212,12 @@ mod multiply_alpha_u8x2 { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -258,6 +270,12 @@ mod multiply_alpha_u16x2 { 
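// Editor's note (hedged sketch, not part of the patch): scalar models of the two least
// obvious helpers added in src/wasm32_utils.rs above. `u16x8_mul_hi` emulates SSE's
// _mm_mulhi_epu16: the widening multiplies produce eight u32 products, and the
// i16x8_shuffle of the odd 16-bit lanes keeps each product's high half. `i64x2_mul_lo`
// multiplies only the low 32 bits of each 64-bit lane, which is exact for this crate's
// inputs because pixel components and coefficients fit in i32. The model function names
// are hypothetical and exist only for illustration.
fn u16x8_mul_hi_model(a: [u16; 8], b: [u16; 8]) -> [u16; 8] {
    // High 16 bits of each unsigned 16x16 -> 32-bit product.
    core::array::from_fn(|i| ((a[i] as u32 * b[i] as u32) >> 16) as u16)
}
fn i64x2_mul_lo_model(a: [i64; 2], b: [i64; 2]) -> [i64; 2] {
    // Product of the sign-extended low 32 bits of each lane.
    core::array::from_fn(|i| (a[i] as i32 as i64) * (b[i] as i32 as i64))
}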
mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -298,6 +316,12 @@ mod multiply_alpha_u16x4 { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -336,6 +360,12 @@ mod divide_alpha_u8x4 { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -388,6 +418,12 @@ mod divide_alpha_u8x2 { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -451,6 +487,12 @@ mod divide_alpha_u16x2 { mul_div_alpha_test(OPER, SRC_PIXELS, SIMD_RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -496,6 +538,12 @@ mod divide_alpha_u16x4 { mul_div_alpha_test(OPER, SRC_PIXELS, SIMD_RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); diff --git a/tests/resize_tests.rs b/tests/resize_tests.rs index 4783bcd..7f7cca1 100644 --- a/tests/resize_tests.rs +++ b/tests/resize_tests.rs @@ -203,6 +203,10 @@ fn resize_to_same_width_after_cropping() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { if !cpu_extensions.is_supported() { continue; @@ -376,6 +380,10 @@ fn downscale_u8() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -400,6 +408,10 @@ fn upscale_u8() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -424,6 +436,10 @@ fn downscale_u8x2() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -452,6 +468,10 @@ fn upscale_u8x2() { { 
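// Editor's note (hedged sketch, not part of the patch): the cfg'd push of
// CpuExtensions::Wasm32 added below is repeated in every test of this file; a helper like
// the hypothetical one sketched here could build the per-architecture list once. The
// baseline entries are assumed, since the start of cpu_extensions_vec is not visible in
// these hunks. Note that on wasm32 builds CpuExtensions::Wasm32 also becomes the Default
// (see the src/resizer.rs hunk above).
fn arch_cpu_extensions() -> Vec<CpuExtensions> {
    let mut extensions = vec![CpuExtensions::None];
    #[cfg(target_arch = "x86_64")]
    {
        extensions.push(CpuExtensions::Sse4_1);
        extensions.push(CpuExtensions::Avx2);
    }
    #[cfg(target_arch = "aarch64")]
    {
        extensions.push(CpuExtensions::Neon);
    }
    #[cfg(target_arch = "wasm32")]
    {
        extensions.push(CpuExtensions::Wasm32);
    }
    extensions
}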
cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -480,6 +500,10 @@ fn downscale_u8x3() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -508,6 +532,10 @@ fn upscale_u8x3() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -536,6 +564,10 @@ fn downscale_u8x4() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -570,6 +602,10 @@ fn upscale_u8x4() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -594,6 +630,10 @@ fn downscale_u16() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -618,6 +658,10 @@ fn upscale_u16() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -646,6 +690,10 @@ fn downscale_u16x2() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -674,6 +722,10 @@ fn upscale_u16x2() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -702,6 +754,10 @@ fn downscale_u16x3() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -730,6 +786,10 @@ fn upscale_u16x3() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -758,6 +818,10 @@ fn downscale_u16x4() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -786,6 +850,10 @@ fn upscale_u16x4() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = 
"wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3),