diff --git a/src/alpha/u16x2/mod.rs b/src/alpha/u16x2/mod.rs index 161c17a..64c203e 100644 --- a/src/alpha/u16x2/mod.rs +++ b/src/alpha/u16x2/mod.rs @@ -11,6 +11,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl AlphaMulDiv for U16x2 { fn multiply_alpha( @@ -25,6 +27,8 @@ impl AlphaMulDiv for U16x2 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha(src_image, dst_image) }, _ => native::multiply_alpha(src_image, dst_image), } } @@ -37,6 +41,8 @@ impl AlphaMulDiv for U16x2 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha_inplace(image) }, _ => native::multiply_alpha_inplace(image), } } @@ -53,6 +59,8 @@ impl AlphaMulDiv for U16x2 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha(src_image, dst_image) }, _ => native::divide_alpha(src_image, dst_image), } } @@ -65,6 +73,8 @@ impl AlphaMulDiv for U16x2 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha_inplace(image) }, _ => native::divide_alpha_inplace(image), } } diff --git a/src/alpha/u16x2/wasm32.rs b/src/alpha/u16x2/wasm32.rs new file mode 100644 index 0000000..aa18633 --- /dev/null +++ b/src/alpha/u16x2/wasm32.rs @@ -0,0 +1,212 @@ +use std::arch::wasm32::*; + +use crate::pixels::U16x2; +use crate::utils::foreach_with_pre_reading; +use crate::{ImageView, ImageViewMut}; + +use super::native; + +pub(crate) unsafe fn multiply_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + multiply_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn multiply_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + multiply_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row(src_row: &[U16x2], dst_row: &mut [U16x2]) { + let src_chunks = src_row.chunks_exact(4); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(4); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiplies_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + native::multiply_alpha_row(src_remainder, dst_reminder); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row_inplace(row: &mut [U16x2]) { + let mut chunks = row.chunks_exact_mut(4); + 
foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiplies_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let reminder = chunks.into_remainder(); + if !reminder.is_empty() { + native::multiply_alpha_row_inplace(reminder); + } +} + +#[inline] +unsafe fn multiplies_alpha_4_pixels(pixels: v128) -> v128 { + const HALF: v128 = i32x4(0x8000, 0x8000, 0x8000, 0x8000); + + const MAX_ALPHA: v128 = u32x4(0xffff0000u32, 0xffff0000u32, 0xffff0000u32, 0xffff0000u32); + /* + |L0 A0 | |L1 A1 | |L2 A2 | |L3 A3 | + |0001 0203| |0405 0607| |0809 1011| |1213 1415| + */ + const FACTOR_MASK: v128 = i8x16(2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15); + + let factor_pixels = u8x16_swizzle(pixels, FACTOR_MASK); + let factor_pixels = v128_or(factor_pixels, MAX_ALPHA); + + let src_u32_lo = u32x4_extend_low_u16x8(pixels); + let factors = u32x4_extend_low_u16x8(factor_pixels); + let src_i32_lo = i32x4_add(i32x4_mul(src_u32_lo, factors), HALF); + let dst_i32_lo = i32x4_add(src_i32_lo, u32x4_shr(src_i32_lo, 16)); + let dst_i32_lo = u32x4_shr(dst_i32_lo, 16); + + let src_u32_hi = u32x4_extend_high_u16x8(pixels); + let factors = u32x4_extend_high_u16x8(factor_pixels); + let src_i32_hi = i32x4_add(i32x4_mul(src_u32_hi, factors), HALF); + let dst_i32_hi = i32x4_add(src_i32_hi, u32x4_shr(src_i32_hi, 16)); + let dst_i32_hi = u32x4_shr(dst_i32_hi, 16); + + u16x8_narrow_i32x4(dst_i32_lo, dst_i32_hi) +} + +// Divide + +pub(crate) unsafe fn divide_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + divide_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn divide_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + divide_alpha_row_inplace(row); + } +} + +pub(crate) unsafe fn divide_alpha_row(src_row: &[U16x2], dst_row: &mut [U16x2]) { + let src_chunks = src_row.chunks_exact(4); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(4); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + let mut src_pixels = [U16x2::new([0, 0]); 4]; + src_pixels + .iter_mut() + .zip(src_remainder) + .for_each(|(d, s)| *d = *s); + + let mut dst_pixels = [U16x2::new([0, 0]); 4]; + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + dst_pixels + .iter() + .zip(dst_reminder) + .for_each(|(s, d)| *d = *s); + } +} + +pub(crate) unsafe fn divide_alpha_row_inplace(row: &mut [U16x2]) { + let mut chunks = row.chunks_exact_mut(4); + // Using a simple for-loop in this case is faster than implementation with pre-reading + for chunk in &mut chunks { + let mut pixels = v128_load(chunk.as_ptr() as *const v128); + pixels = divide_alpha_4_pixels(pixels); + v128_store(chunk.as_mut_ptr() as *mut v128, pixels); + } + + let reminder = 
chunks.into_remainder(); + if !reminder.is_empty() { + let mut src_pixels = [U16x2::new([0, 0]); 4]; + src_pixels + .iter_mut() + .zip(reminder.iter()) + .for_each(|(d, s)| *d = *s); + + let mut dst_pixels = [U16x2::new([0, 0]); 4]; + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + dst_pixels.iter().zip(reminder).for_each(|(s, d)| *d = *s); + } +} + +#[inline] +unsafe fn divide_alpha_4_pixels(pixels: v128) -> v128 { + const ALPHA_MASK: v128 = u32x4(0xffff0000u32, 0xffff0000u32, 0xffff0000u32, 0xffff0000u32); + const LUMA_MASK: v128 = i32x4(0xffff, 0xffff, 0xffff, 0xffff); + const ALPHA_MAX: v128 = f32x4(65535.0, 65535.0, 65535.0, 65535.0); + const ALPHA_SCALE_MAX: v128 = f32x4(2147483648f32, 2147483648f32, 2147483648f32, 2147483648f32); + /* + |L0 A0 | |L1 A1 | |L2 A2 | |L3 A3 | + |0001 0203| |0405 0607| |0809 1011| |1213 1415| + */ + const ALPHA32_SH: v128 = i8x16(2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1); + + let alpha_f32x4 = f32x4_convert_i32x4(u8x16_swizzle(pixels, ALPHA32_SH)); + let luma_f32x4 = f32x4_convert_i32x4(v128_and(pixels, LUMA_MASK)); + let scaled_luma_f32x4 = f32x4_mul(luma_f32x4, ALPHA_MAX); + let divided_luma_u32x4 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(scaled_luma_f32x4, alpha_f32x4), + ALPHA_SCALE_MAX, + )); + + let alpha = v128_and(pixels, ALPHA_MASK); + u8x16_shuffle::<0, 1, 18, 19, 4, 5, 22, 23, 8, 9, 26, 27, 12, 13, 30, 31>( + divided_luma_u32x4, + alpha, + ) +} diff --git a/src/alpha/u16x4/mod.rs b/src/alpha/u16x4/mod.rs index 2ea82f2..9b90d8f 100644 --- a/src/alpha/u16x4/mod.rs +++ b/src/alpha/u16x4/mod.rs @@ -11,6 +11,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl AlphaMulDiv for U16x4 { fn multiply_alpha( @@ -25,6 +27,8 @@ impl AlphaMulDiv for U16x4 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha(src_image, dst_image) }, _ => native::multiply_alpha(src_image, dst_image), } } @@ -37,6 +41,8 @@ impl AlphaMulDiv for U16x4 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha_inplace(image) }, _ => native::multiply_alpha_inplace(image), } } @@ -53,6 +59,8 @@ impl AlphaMulDiv for U16x4 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha(src_image, dst_image) }, _ => native::divide_alpha(src_image, dst_image), } } @@ -65,6 +73,8 @@ impl AlphaMulDiv for U16x4 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha_inplace(image) }, _ => native::divide_alpha_inplace(image), } } diff --git a/src/alpha/u16x4/wasm32.rs b/src/alpha/u16x4/wasm32.rs new file mode 100644 index 
0000000..7b68f44 --- /dev/null +++ b/src/alpha/u16x4/wasm32.rs @@ -0,0 +1,220 @@ +use std::arch::wasm32::*; + +use crate::pixels::U16x4; +use crate::utils::foreach_with_pre_reading; +use crate::{ImageView, ImageViewMut}; + +use super::native; + +pub(crate) unsafe fn multiply_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + multiply_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn multiply_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + multiply_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row(src_row: &[U16x4], dst_row: &mut [U16x4]) { + let src_chunks = src_row.chunks_exact(2); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(2); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiply_alpha_2_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + native::multiply_alpha_row(src_remainder, dst_reminder); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row_inplace(row: &mut [U16x4]) { + let mut chunks = row.chunks_exact_mut(2); + foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiply_alpha_2_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let remainder = chunks.into_remainder(); + if !remainder.is_empty() { + native::multiply_alpha_row_inplace(remainder); + } +} + +#[inline] +unsafe fn multiply_alpha_2_pixels(pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let half = i32x4_splat(0x8000); + const MAX_A: i64 = 0xffff000000000000u64 as i64; + let max_alpha = i64x2_splat(MAX_A); + /* + |R0 G0 B0 A0 | |R1 G1 B1 A1 | + |0001 0203 0405 0607| |0809 1011 1213 1415| + */ + const FACTOR_MASK: v128 = i8x16(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15); + + let factor_pixels = u8x16_swizzle(pixels, FACTOR_MASK); + let factor_pixels = v128_or(factor_pixels, max_alpha); + + let src_i32_lo = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(pixels, zero); + let factors = i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(factor_pixels, zero); + let src_i32_lo = i32x4_add(i32x4_mul(src_i32_lo, factors), half); + let dst_i32_lo = i32x4_add(src_i32_lo, u32x4_shr(src_i32_lo, 16)); + let dst_i32_lo = u32x4_shr(dst_i32_lo, 16); + + let src_i32_hi = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(pixels, zero); + let factors = i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(factor_pixels, zero); + let src_i32_hi = i32x4_add(i32x4_mul(src_i32_hi, factors), half); + let dst_i32_hi = i32x4_add(src_i32_hi, u32x4_shr(src_i32_hi, 16)); + let dst_i32_hi = u32x4_shr(dst_i32_hi, 16); + + u16x8_narrow_i32x4(dst_i32_lo, dst_i32_hi) +} + +// Divide + +pub(crate) unsafe fn divide_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + divide_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe 
fn divide_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + divide_alpha_row_inplace(row); + } +} + +pub(crate) unsafe fn divide_alpha_row(src_row: &[U16x4], dst_row: &mut [U16x4]) { + let src_chunks = src_row.chunks_exact(2); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(2); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_2_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if let Some(src) = src_remainder.first() { + let src_pixels = [*src, U16x4::new([0, 0, 0, 0])]; + let mut dst_pixels = [U16x4::new([0, 0, 0, 0]); 2]; + + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_2_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + let dst_reminder = dst_chunks.into_remainder(); + if let Some(dst) = dst_reminder.get_mut(0) { + *dst = dst_pixels[0]; + } + } +} + +pub(crate) unsafe fn divide_alpha_row_inplace(row: &mut [U16x4]) { + let mut chunks = row.chunks_exact_mut(2); + foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_2_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let reminder = chunks.into_remainder(); + if let Some(pixel) = reminder.first_mut() { + let src_pixels = [*pixel, U16x4::new([0, 0, 0, 0])]; + let mut dst_pixels = [U16x4::new([0, 0, 0, 0]); 2]; + + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_2_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + *pixel = dst_pixels[0]; + } +} + +#[inline] +unsafe fn divide_alpha_2_pixels(pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let alpha_mask = i64x2_splat(0xffff000000000000u64 as i64); + let alpha_max = f32x4_splat(65535.0); + let alpha_scale_max = f32x4_splat(2147483648f32); + /* + |R0 G0 B0 A0 | |R1 G1 B1 A1 | + |0001 0203 0405 0607| |0809 1011 1213 1415| + */ + const ALPHA32_SH0: v128 = i8x16(6, 7, -1, -1, 6, 7, -1, -1, 6, 7, -1, -1, 6, 7, -1, -1); + const ALPHA32_SH1: v128 = i8x16( + 14, 15, -1, -1, 14, 15, -1, -1, 14, 15, -1, -1, 14, 15, -1, -1, + ); + + let alpha0_f32x4 = f32x4_convert_i32x4(u8x16_swizzle(pixels, ALPHA32_SH0)); + let alpha1_f32x4 = f32x4_convert_i32x4(u8x16_swizzle(pixels, ALPHA32_SH1)); + + let pix0_f32x4 = f32x4_convert_i32x4(i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(pixels, zero)); + let pix1_f32x4 = f32x4_convert_i32x4(i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(pixels, zero)); + + let scaled_pix0_f32x4 = f32x4_mul(pix0_f32x4, alpha_max); + let scaled_pix1_f32x4 = f32x4_mul(pix1_f32x4, alpha_max); + + let divided_pix0_i32x4 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(scaled_pix0_f32x4, alpha0_f32x4), + alpha_scale_max, + )); + let divided_pix1_i32x4 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(scaled_pix1_f32x4, alpha1_f32x4), + alpha_scale_max, + )); + + let two_pixels_i16x8 = u16x8_narrow_i32x4(divided_pix0_i32x4, divided_pix1_i32x4); + let alpha = v128_and(pixels, alpha_mask); + u8x16_shuffle::<0, 1, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 12, 13, 30, 31>(two_pixels_i16x8, alpha) +} diff --git a/src/alpha/u8x2/mod.rs b/src/alpha/u8x2/mod.rs index 
1014d97..11dbee5 100644 --- a/src/alpha/u8x2/mod.rs +++ b/src/alpha/u8x2/mod.rs @@ -11,6 +11,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl AlphaMulDiv for U8x2 { fn multiply_alpha( @@ -25,6 +27,8 @@ impl AlphaMulDiv for U8x2 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha(src_image, dst_image) }, _ => native::multiply_alpha(src_image, dst_image), } } @@ -37,6 +41,8 @@ impl AlphaMulDiv for U8x2 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha_inplace(image) }, _ => native::multiply_alpha_inplace(image), } } @@ -53,6 +59,8 @@ impl AlphaMulDiv for U8x2 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha(src_image, dst_image) }, _ => native::divide_alpha(src_image, dst_image), } } @@ -65,6 +73,8 @@ impl AlphaMulDiv for U8x2 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha_inplace(image) }, _ => native::divide_alpha_inplace(image), } } diff --git a/src/alpha/u8x2/wasm32.rs b/src/alpha/u8x2/wasm32.rs new file mode 100644 index 0000000..0c51690 --- /dev/null +++ b/src/alpha/u8x2/wasm32.rs @@ -0,0 +1,233 @@ +use std::arch::wasm32::*; + +use crate::pixels::U8x2; +use crate::utils::foreach_with_pre_reading; +use crate::{ImageView, ImageViewMut}; + +use super::native; + +pub(crate) unsafe fn multiply_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + multiply_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn multiply_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + multiply_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row(src_row: &[U8x2], dst_row: &mut [U8x2]) { + let src_chunks = src_row.chunks_exact(8); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(8); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiplies_alpha_8_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + native::multiply_alpha_row(src_remainder, dst_reminder); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row_inplace(row: &mut [U8x2]) { + let mut chunks = row.chunks_exact_mut(8); + // Using a simple for-loop in this case is faster than implementation with pre-reading + for chunk in 
&mut chunks { + let src_pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_pixels = multiplies_alpha_8_pixels(src_pixels); + v128_store(chunk.as_mut_ptr() as *mut v128, dst_pixels); + } + + let reminder = chunks.into_remainder(); + if !reminder.is_empty() { + native::multiply_alpha_row_inplace(reminder); + } +} + +#[inline] +unsafe fn multiplies_alpha_8_pixels(pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let half = i16x8_splat(128); + const MAX_A: i16 = 0xff00u16 as i16; + let max_alpha = i16x8_splat(MAX_A); + /* + |L A | |L A | |L A | |L A | |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + */ + const FACTOR_MASK: v128 = i8x16(1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); + + let factor_pixels = i8x16_swizzle(pixels, FACTOR_MASK); + let factor_pixels = v128_or(factor_pixels, max_alpha); + + let src_i16_lo = + i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(pixels, zero); + let factors = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + factor_pixels, + zero, + ); + let src_i16_lo = i16x8_add(i16x8_mul(src_i16_lo, factors), half); + let dst_i16_lo = i16x8_add(src_i16_lo, u16x8_shr(src_i16_lo, 8)); + let dst_i16_lo = u16x8_shr(dst_i16_lo, 8); + + let src_i16_hi = + i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(pixels, zero); + let factors = i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + factor_pixels, + zero, + ); + let src_i16_hi = i16x8_add(i16x8_mul(src_i16_hi, factors), half); + let dst_i16_hi = i16x8_add(src_i16_hi, u16x8_shr(src_i16_hi, 8)); + let dst_i16_hi = u16x8_shr(dst_i16_hi, 8); + + u8x16_narrow_i16x8(dst_i16_lo, dst_i16_hi) +} + +// Divide + +pub(crate) unsafe fn divide_alpha(src_image: &ImageView, dst_image: &mut ImageViewMut) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + divide_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn divide_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + divide_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn divide_alpha_row(src_row: &[U8x2], dst_row: &mut [U8x2]) { + let src_chunks = src_row.chunks_exact(8); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(8); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_8_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + let mut src_pixels = [U8x2::new(0); 8]; + src_pixels + .iter_mut() + .zip(src_remainder) + .for_each(|(d, s)| *d = *s); + + let mut dst_pixels = [U8x2::new(0); 8]; + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_8_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + dst_pixels + .iter() + .zip(dst_reminder) + .for_each(|(s, d)| *d = *s); + } +} + +#[inline] +pub(crate) unsafe fn divide_alpha_row_inplace(row: &mut [U8x2]) { + let mut chunks = row.chunks_exact_mut(8); + foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as 
*mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_8_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let reminder = chunks.into_remainder(); + if !reminder.is_empty() { + let mut src_pixels = [U8x2::new(0); 8]; + src_pixels + .iter_mut() + .zip(reminder.iter()) + .for_each(|(d, s)| *d = *s); + + let mut dst_pixels = [U8x2::new(0); 8]; + let mut pixels = v128_load(src_pixels.as_ptr() as *const v128); + pixels = divide_alpha_8_pixels(pixels); + v128_store(dst_pixels.as_mut_ptr() as *mut v128, pixels); + + dst_pixels.iter().zip(reminder).for_each(|(s, d)| *d = *s); + } +} + +#[inline] +unsafe fn divide_alpha_8_pixels(pixels: v128) -> v128 { + let alpha_mask = i16x8_splat(0xff00u16 as i16); + let luma_mask = i16x8_splat(0xff); + const ALPHA32_SH_LO: v128 = i8x16(1, -1, -1, -1, 3, -1, -1, -1, 5, -1, -1, -1, 7, -1, -1, -1); + const ALPHA32_SH_HI: v128 = i8x16( + 9, -1, -1, -1, 11, -1, -1, -1, 13, -1, -1, -1, 15, -1, -1, -1, + ); + let alpha_scale = f32x4_splat(255.0 * 256.0); + // sse4 _mm_cvtps_epi32 converts inf to i32::MIN or 2147483648f32 u32. + // wasm32 u32x4_trunc_sat_f32x4 on AVX systems converts inf to u32::MAX. + // Tests pass without capping inf from dividing by zero, but scaled values will not match sse4, + // and other potential test cases will (probably?) break. + let alpha_scale_max = f32x4_splat(2147483648f32); + + let alpha_lo_f32 = f32x4_convert_u32x4(i8x16_swizzle(pixels, ALPHA32_SH_LO)); + // trunc_sat will always round down. Adding f32x4_nearest would match _mm_cvtps_epi32 exactly, + // but would add extra instructions. + let scaled_alpha_lo_u32 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(alpha_scale, alpha_lo_f32), + alpha_scale_max, + )); + let alpha_hi_f32 = f32x4_convert_u32x4(i8x16_swizzle(pixels, ALPHA32_SH_HI)); + let scaled_alpha_hi_u32 = u32x4_trunc_sat_f32x4(f32x4_pmin( + f32x4_div(alpha_scale, alpha_hi_f32), + alpha_scale_max, + )); + let scaled_alpha_u16 = u16x8_narrow_i32x4(scaled_alpha_lo_u32, scaled_alpha_hi_u32); + + let luma_u16 = v128_and(pixels, luma_mask); + let scaled_luma_u16 = u16x8_mul(luma_u16, scaled_alpha_u16); + let scaled_luma_u16 = u16x8_shr(scaled_luma_u16, 8); + + let alpha = v128_and(pixels, alpha_mask); + u8x16_shuffle::<0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31>( + scaled_luma_u16, + alpha, + ) +} diff --git a/src/alpha/u8x4/mod.rs b/src/alpha/u8x4/mod.rs index 9682f48..1f529a0 100644 --- a/src/alpha/u8x4/mod.rs +++ b/src/alpha/u8x4/mod.rs @@ -11,6 +11,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl AlphaMulDiv for U8x4 { fn multiply_alpha( @@ -25,6 +27,8 @@ impl AlphaMulDiv for U8x4 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha(src_image, dst_image) }, _ => native::multiply_alpha(src_image, dst_image), } } @@ -37,6 +41,8 @@ impl AlphaMulDiv for U8x4 { CpuExtensions::Sse4_1 => unsafe { sse4::multiply_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::multiply_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::multiply_alpha_inplace(image) }, _ => native::multiply_alpha_inplace(image), } } @@ -53,6 +59,8 @@ impl AlphaMulDiv for U8x4 { CpuExtensions::Sse4_1 => unsafe { 
sse4::divide_alpha(src_image, dst_image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha(src_image, dst_image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha(src_image, dst_image) }, _ => native::divide_alpha(src_image, dst_image), } } @@ -65,6 +73,8 @@ impl AlphaMulDiv for U8x4 { CpuExtensions::Sse4_1 => unsafe { sse4::divide_alpha_inplace(image) }, #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => unsafe { neon::divide_alpha_inplace(image) }, + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => unsafe { wasm32::divide_alpha_inplace(image) }, _ => native::divide_alpha_inplace(image), } } diff --git a/src/alpha/u8x4/wasm32.rs b/src/alpha/u8x4/wasm32.rs new file mode 100644 index 0000000..502c4b6 --- /dev/null +++ b/src/alpha/u8x4/wasm32.rs @@ -0,0 +1,220 @@ +use std::arch::wasm32::*; + +use crate::pixels::U8x4; +use crate::utils::foreach_with_pre_reading; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +use super::native; + +pub(crate) unsafe fn multiply_alpha( + src_image: &ImageView, + dst_image: &mut ImageViewMut, +) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + + for (src_row, dst_row) in src_rows.zip(dst_rows) { + multiply_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn multiply_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + multiply_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row(src_row: &[U8x4], dst_row: &mut [U8x4]) { + let src_chunks = src_row.chunks_exact(4); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(4); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = multiply_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + native::multiply_alpha_row(src_remainder, dst_reminder); + } +} + +#[inline] +pub(crate) unsafe fn multiply_alpha_row_inplace(row: &mut [U8x4]) { + let mut chunks = row.chunks_exact_mut(4); + // Using a simple for-loop in this case is faster than implementation with pre-reading + for chunk in &mut chunks { + let mut pixels = v128_load(chunk.as_ptr() as *const v128); + pixels = multiply_alpha_4_pixels(pixels); + v128_store(chunk.as_mut_ptr() as *mut v128, pixels); + } + + let tail = chunks.into_remainder(); + if !tail.is_empty() { + native::multiply_alpha_row_inplace(tail); + } +} + +#[inline] +unsafe fn multiply_alpha_4_pixels(pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let half = i16x8_splat(128); + const MAX_A: u32 = 0xff000000u32; + let max_alpha = u32x4_splat(MAX_A); + + const FACTOR_MASK: v128 = i8x16(3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15); + + let factor_pixels = u8x16_swizzle(pixels, FACTOR_MASK); + let factor_pixels = v128_or(factor_pixels, max_alpha); + + let pix1 = + i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(pixels, zero); + let factors = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + factor_pixels, + zero, + ); + let pix1 = i16x8_add(i16x8_mul(pix1, factors), half); + let pix1 = i16x8_add(pix1, u16x8_shr(pix1, 8)); + let pix1 = u16x8_shr(pix1, 8); + + let pix2 = + 
i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(pixels, zero); + let factors = i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + factor_pixels, + zero, + ); + let pix2 = i16x8_add(i16x8_mul(pix2, factors), half); + let pix2 = i16x8_add(pix2, u16x8_shr(pix2, 8)); + let pix2 = u16x8_shr(pix2, 8); + + u8x16_narrow_i16x8(pix1, pix2) +} + +// Divide + +pub(crate) unsafe fn divide_alpha(src_image: &ImageView, dst_image: &mut ImageViewMut) { + let src_rows = src_image.iter_rows(0); + let dst_rows = dst_image.iter_rows_mut(); + for (src_row, dst_row) in src_rows.zip(dst_rows) { + divide_alpha_row(src_row, dst_row); + } +} + +pub(crate) unsafe fn divide_alpha_inplace(image: &mut ImageViewMut) { + for row in image.iter_rows_mut() { + divide_alpha_row_inplace(row); + } +} + +#[inline] +pub(crate) unsafe fn divide_alpha_row(src_row: &[U8x4], dst_row: &mut [U8x4]) { + let src_chunks = src_row.chunks_exact(4); + let src_remainder = src_chunks.remainder(); + let mut dst_chunks = dst_row.chunks_exact_mut(4); + let src_dst = src_chunks.zip(&mut dst_chunks); + foreach_with_pre_reading( + src_dst, + |(src, dst)| { + let pixels = v128_load(src.as_ptr() as *const v128); + let dst_ptr = dst.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + if !src_remainder.is_empty() { + let dst_reminder = dst_chunks.into_remainder(); + let mut src_buffer = [U8x4::new(0); 4]; + src_buffer + .iter_mut() + .zip(src_remainder) + .for_each(|(d, s)| *d = *s); + + let mut dst_buffer = [U8x4::new(0); 4]; + let src_pixels = v128_load(src_buffer.as_ptr() as *const v128); + let dst_pixels = divide_alpha_4_pixels(src_pixels); + v128_store(dst_buffer.as_mut_ptr() as *mut v128, dst_pixels); + + dst_buffer + .iter() + .zip(dst_reminder) + .for_each(|(s, d)| *d = *s); + } +} + +#[inline] +pub(crate) unsafe fn divide_alpha_row_inplace(row: &mut [U8x4]) { + let mut chunks = row.chunks_exact_mut(4); + foreach_with_pre_reading( + &mut chunks, + |chunk| { + let pixels = v128_load(chunk.as_ptr() as *const v128); + let dst_ptr = chunk.as_mut_ptr() as *mut v128; + (pixels, dst_ptr) + }, + |(mut pixels, dst_ptr)| { + pixels = divide_alpha_4_pixels(pixels); + v128_store(dst_ptr, pixels); + }, + ); + + let tail = chunks.into_remainder(); + if !tail.is_empty() { + let mut src_buffer = [U8x4::new(0); 4]; + src_buffer + .iter_mut() + .zip(tail.iter()) + .for_each(|(d, s)| *d = *s); + + let mut dst_buffer = [U8x4::new(0); 4]; + let src_pixels = v128_load(src_buffer.as_ptr() as *const v128); + let dst_pixels = divide_alpha_4_pixels(src_pixels); + v128_store(dst_buffer.as_mut_ptr() as *mut v128, dst_pixels); + + dst_buffer.iter().zip(tail).for_each(|(s, d)| *d = *s); + } +} + +#[inline] +unsafe fn divide_alpha_4_pixels(src_pixels: v128) -> v128 { + let zero = i64x2_splat(0); + let alpha_mask = i32x4_splat(0xff000000u32 as i32); + const SHUFFLE1: v128 = i8x16(0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, 5, 4, 5); + const SHUFFLE2: v128 = i8x16(8, 9, 8, 9, 8, 9, 8, 9, 12, 13, 12, 13, 12, 13, 12, 13); + let alpha_scale = f32x4_splat(255.0 * 256.0); + let alpha_scale_max = f32x4_splat(2147483648f32); + + let alpha_f32 = f32x4_convert_i32x4(u32x4_shr(src_pixels, 24)); + let scaled_alpha_f32 = f32x4_div(alpha_scale, alpha_f32); + let scaled_alpha_u32 = u32x4_trunc_sat_f32x4(f32x4_pmin(scaled_alpha_f32, alpha_scale_max)); + let mma0 = u8x16_swizzle(scaled_alpha_u32, SHUFFLE1); + let mma1 = 
u8x16_swizzle(scaled_alpha_u32, SHUFFLE2); + + let pix0 = + u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(zero, src_pixels); + let pix1 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + zero, src_pixels, + ); + + let pix0 = wasm32_utils::u16x8_mul_hi(pix0, mma0); + let pix1 = wasm32_utils::u16x8_mul_hi(pix1, mma1); + + let alpha = v128_and(src_pixels, alpha_mask); + let rgb = u8x16_narrow_i16x8(pix0, pix1); + + u8x16_shuffle::<0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31>(rgb, alpha) +} diff --git a/src/convolution/u16x1/mod.rs b/src/convolution/u16x1/mod.rs index 4e9e9bb..3bb33b6 100644 --- a/src/convolution/u16x1/mod.rs +++ b/src/convolution/u16x1/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U16 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U16 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u16x1/wasm32.rs b/src/convolution/u16x1/wasm32.rs new file mode 100644 index 0000000..9e2c7aa --- /dev/null +++ b/src/convolution/u16x1/wasm32.rs @@ -0,0 +1,284 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U16; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_one_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( + src_rows: [&[U16]; 4], + dst_rows: [&mut &mut [U16]; 4], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut ll_buf = [0i64; 2]; + + /* + |L0 | |L1 | |L2 | |L3 | |L4 | |L5 | |L6 | |L7 | + |0001| |0203| |0405| |0607| |0809| |1011| |1213| |1415| + + Shuffle to extract L0 and L1 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L2 and L3 as i64: + 4, 5, -1, -1, -1, 
-1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L4 and L5 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L6 and L7 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const L0L1_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const L2L3_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const L4L5_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const L6L7_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut ll_sum: [v128; 4] = [i64x2_splat(0i64); 4]; + + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_8 = coeffs.chunks_exact(8); + coeffs = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let coeff23_i64x2 = i64x2(k[2] as i64, k[3] as i64); + let coeff45_i64x2 = i64x2(k[4] as i64, k[5] as i64); + let coeff67_i64x2 = i64x2(k[6] as i64, k[7] as i64); + + for i in 0..4 { + let mut sum = ll_sum[i]; + let source = wasm32_utils::load_v128(src_rows[i], x); + + let l0l1_i64x2 = i8x16_swizzle(source, L0L1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l0l1_i64x2, coeff01_i64x2)); + + let l2l3_i64x2 = i8x16_swizzle(source, L2L3_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l2l3_i64x2, coeff23_i64x2)); + + let l4l5_i64x2 = i8x16_swizzle(source, L4L5_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l4l5_i64x2, coeff45_i64x2)); + + let l6l7_i64x2 = i8x16_swizzle(source, L6L7_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l6l7_i64x2, coeff67_i64x2)); + + ll_sum[i] = sum; + } + x += 8; + } + + let coeffs_by_4 = coeffs.chunks_exact(4); + coeffs = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let coeff23_i64x2 = i64x2(k[2] as i64, k[3] as i64); + + for i in 0..4 { + let mut sum = ll_sum[i]; + let source = wasm32_utils::load_v128(src_rows[i], x); + + let l0l1_i64x2 = i8x16_swizzle(source, L0L1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l0l1_i64x2, coeff01_i64x2)); + + let l2l3_i64x2 = i8x16_swizzle(source, L2L3_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(l2l3_i64x2, coeff23_i64x2)); + + ll_sum[i] = sum; + } + x += 4; + } + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + for i in 0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + let l_i64x2 = i8x16_swizzle(source, L0L1_SHUFFLE); + ll_sum[i] = i64x2_add( + ll_sum[i], + wasm32_utils::i64x2_mul_lo(l_i64x2, coeff01_i64x2), + ); + } + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff01_i64x2 = i64x2(k as i64, 0); + for i in 0..4 { + let pixel = (*src_rows[i].get_unchecked(x)).0 as i64; + let source = i64x2(pixel, 0); + ll_sum[i] = i64x2_add(ll_sum[i], wasm32_utils::i64x2_mul_lo(source, coeff01_i64x2)); + } + } + + for i in 0..4 { + v128_store((&mut ll_buf).as_mut_ptr() as *mut v128, ll_sum[i]); + let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); + dst_pixel.0 = normalizer.clip(ll_buf.iter().sum::() + half_error); + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - 
coefficients_chunks.len() == dst_row.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len() +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_one_row( + src_row: &[U16], + dst_row: &mut [U16], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut ll_buf = [0i64; 2]; + + /* + |L0 | |L1 | |L2 | |L3 | |L4 | |L5 | |L6 | |L7 | + |0001| |0203| |0405| |0607| |0809| |1011| |1213| |1415| + + Shuffle to extract L0 and L1 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L2 and L3 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L4 and L5 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L6 and L7 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const L01_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const L23_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const L45_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const L67_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut ll_sum = i64x2_splat(0); + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_8 = coeffs.chunks_exact(8); + coeffs = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let coeff23_i64x2 = i64x2(k[2] as i64, k[3] as i64); + let coeff45_i64x2 = i64x2(k[4] as i64, k[5] as i64); + let coeff67_i64x2 = i64x2(k[6] as i64, k[7] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let l_i64x2 = i8x16_swizzle(source, L01_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff01_i64x2)); + + let l_i64x2 = i8x16_swizzle(source, L23_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff23_i64x2)); + + let l_i64x2 = i8x16_swizzle(source, L45_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff45_i64x2)); + + let l_i64x2 = i8x16_swizzle(source, L67_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff67_i64x2)); + + x += 8; + } + + let coeffs_by_4 = coeffs.chunks_exact(4); + coeffs = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let coeff23_i64x2 = i64x2(k[2] as i64, k[3] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let l_i64x2 = i8x16_swizzle(source, L01_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff01_i64x2)); + + let l_i64x2 = i8x16_swizzle(source, L23_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff23_i64x2)); + + x += 4; + } + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff01_i64x2 = i64x2(k[0] as i64, k[1] as i64); + let source = wasm32_utils::load_v128(src_row, x); + + let l_i64x2 = i8x16_swizzle(source, L01_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(l_i64x2, coeff01_i64x2)); + + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff01_i64x2 = i64x2(k 
as i64, 0); + let pixel = (*src_row.get_unchecked(x)).0 as i64; + let source = i64x2(pixel, 0); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(source, coeff01_i64x2)); + } + + v128_store((&mut ll_buf).as_mut_ptr() as *mut v128, ll_sum); + let dst_pixel = dst_row.get_unchecked_mut(dst_x); + dst_pixel.0 = normalizer.clip(ll_buf[0] + ll_buf[1] + half_error); + } +} diff --git a/src/convolution/u16x2/mod.rs b/src/convolution/u16x2/mod.rs index 8d830f6..7e4e8c3 100644 --- a/src/convolution/u16x2/mod.rs +++ b/src/convolution/u16x2/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U16x2 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U16x2 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u16x2/wasm32.rs b/src/convolution/u16x2/wasm32.rs new file mode 100644 index 0000000..769784a --- /dev/null +++ b/src/convolution/u16x2/wasm32.rs @@ -0,0 +1,259 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U16x2; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_one_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( + src_rows: [&[U16x2]; 4], + dst_rows: [&mut &mut [U16x2]; 4], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut ll_buf = [0i64; 2]; + + /* + |L0 A0 | |L1 A1 | |L2 A2 | |L3 A3 | + |0001 0203| |0405 0607| |0809 1011| |1213 1415| + + Shuffle to extract L0 and A0 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L1 and A1 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L2 and A2 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle 
to extract L3 and A3 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const P0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const P1_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const P2_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const P3_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut ll_sum = [i64x2_splat(half_error); 4]; + + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_4 = coeffs.chunks_exact(4); + coeffs = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + let coeff2_i64x2 = i64x2_splat(k[2] as i64); + let coeff3_i64x2 = i64x2_splat(k[3] as i64); + + for i in 0..4 { + let mut sum = ll_sum[i]; + let source = wasm32_utils::load_v128(src_rows[i], x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff1_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P2_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff2_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P3_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff3_i64x2)); + + ll_sum[i] = sum; + } + x += 4; + } + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + + for i in 0..4 { + let mut sum = ll_sum[i]; + let source = wasm32_utils::loadl_i64(src_rows[i], x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff1_i64x2)); + + ll_sum[i] = sum; + } + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff0_i64x2 = i64x2_splat(k as i64); + for i in 0..4 { + let source = wasm32_utils::loadl_i32(src_rows[i], x); + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + ll_sum[i] = i64x2_add(ll_sum[i], wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + } + } + + for i in 0..4 { + v128_store((&mut ll_buf).as_mut_ptr() as *mut v128, ll_sum[i]); + let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); + dst_pixel.0 = [normalizer.clip(ll_buf[0]), normalizer.clip(ll_buf[1])]; + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_one_row( + src_row: &[U16x2], + dst_row: &mut [U16x2], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut ll_buf = [0i64; 2]; + + /* + |L0 A0 | |L1 A1 | |L2 A2 | |L3 A3 | + |0001 0203| |0405 0607| |0809 1011| |1213 1415| + + Shuffle to extract L0 and A0 
as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L1 and A1 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L2 and A2 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract L3 and A3 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const P0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const P1_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const P2_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const P3_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut ll_sum = i64x2_splat(half_error); + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_4 = coeffs.chunks_exact(4); + coeffs = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + let coeff2_i64x2 = i64x2_splat(k[2] as i64); + let coeff3_i64x2 = i64x2_splat(k[3] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P1_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff1_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P2_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff2_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P3_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff3_i64x2)); + + x += 4; + } + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + + let source = wasm32_utils::loadl_i64(src_row, x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + + let p_i64x2 = i8x16_swizzle(source, P1_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff1_i64x2)); + + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff0_i64x2 = i64x2_splat(k as i64); + let source = wasm32_utils::loadl_i32(src_row, x); + + let p_i64x2 = i8x16_swizzle(source, P0_SHUFFLE); + ll_sum = i64x2_add(ll_sum, wasm32_utils::i64x2_mul_lo(p_i64x2, coeff0_i64x2)); + } + + v128_store((&mut ll_buf).as_mut_ptr() as *mut v128, ll_sum); + let dst_pixel = dst_row.get_unchecked_mut(dst_x); + dst_pixel.0 = [normalizer.clip(ll_buf[0]), normalizer.clip(ll_buf[1])]; + } +} diff --git a/src/convolution/u16x3/mod.rs b/src/convolution/u16x3/mod.rs index 8319e00..bffbd8e 100644 --- a/src/convolution/u16x3/mod.rs +++ b/src/convolution/u16x3/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U16x3 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U16x3 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + 
CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u16x3/wasm32.rs b/src/convolution/u16x3/wasm32.rs new file mode 100644 index 0000000..c16b7de --- /dev/null +++ b/src/convolution/u16x3/wasm32.rs @@ -0,0 +1,236 @@ +use std::arch::wasm32::*; + +use crate::convolution::optimisations::CoefficientsI32Chunk; +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U16x3; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_8u4x(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_8u( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u4x( + src_rows: [&[U16x3]; 4], + dst_rows: [&mut &mut [U16x3]; 4], + coefficients_chunks: &[CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + const ZERO: v128 = i64x2(0, 0); + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut rg_buf = [0i64; 2]; + let mut bb_buf = [0i64; 2]; + + /* + |R G B | |R G B | |R G | + |0001 0203 0405| |0607 0809 1011| |1213 1415| + + Shuffle to extract RG components of first pixel as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract RG components of second pixel as i64: + 6, 7, -1, -1, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B components of two pixels as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + */ + + const RG0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const RG1_SHUFFLE: v128 = i8x16(6, 7, -1, -1, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1); + const BB_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + + let width = src_rows[0].len(); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut rg_sum = [ZERO; 4]; + let mut bb_sum = [ZERO; 4]; + + let mut coeffs = coeffs_chunk.values; + let end_x = x + coeffs.len(); + + if width - end_x >= 1 { + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + let coeff_i64x2 = i64x2(k[0] as i64, k[1] as i64); + + for i in 
0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + + let rg0_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum[i] = i64x2_add( + rg_sum[i], + wasm32_utils::i64x2_mul_lo(rg0_i64x2, coeff0_i64x2), + ); + + let rg1_i64x2 = i8x16_swizzle(source, RG1_SHUFFLE); + rg_sum[i] = i64x2_add( + rg_sum[i], + wasm32_utils::i64x2_mul_lo(rg1_i64x2, coeff1_i64x2), + ); + + let bb_i64x2 = i8x16_swizzle(source, BB_SHUFFLE); + bb_sum[i] = + i64x2_add(bb_sum[i], wasm32_utils::i64x2_mul_lo(bb_i64x2, coeff_i64x2)); + } + x += 2; + } + } + + for &k in coeffs { + let coeff_i64x2 = i64x2_splat(k as i64); + + for i in 0..4 { + let &pixel = src_rows[i].get_unchecked(x); + let rg_i64x2 = i64x2(pixel.0[0] as i64, pixel.0[1] as i64); + rg_sum[i] = i64x2_add(rg_sum[i], wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff_i64x2)); + let bb_i64x2 = i64x2(pixel.0[2] as i64, 0); + bb_sum[i] = i64x2_add(bb_sum[i], wasm32_utils::i64x2_mul_lo(bb_i64x2, coeff_i64x2)); + } + x += 1; + } + + for i in 0..4 { + v128_store((&mut rg_buf).as_mut_ptr() as *mut v128, rg_sum[i]); + v128_store((&mut bb_buf).as_mut_ptr() as *mut v128, bb_sum[i]); + let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); + dst_pixel.0[0] = normalizer.clip(rg_buf[0] + half_error); + dst_pixel.0[1] = normalizer.clip(rg_buf[1] + half_error); + dst_pixel.0[2] = normalizer.clip(bb_buf[0] + bb_buf[1] + half_error); + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coefficients_chunks.len() == dst_row.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u( + src_row: &[U16x3], + dst_row: &mut [U16x3], + coefficients_chunks: &[CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let rg_initial = i64x2_splat(1 << (precision - 1)); + let bb_initial = i64x2_splat(1 << (precision - 2)); + + /* + |R G B | |R G B | |R G | + |0001 0203 0405| |0607 0809 1011| |1213 1415| + + Shuffle to extract RG components of first pixel as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract RG components of second pixel as i64: + 6, 7, -1, -1, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B components of two pixels as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + */ + + const RG0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const RG1_SHUFFLE: v128 = i8x16(6, 7, -1, -1, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, -1, -1); + const BB_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + let mut rg_buf = [0i64; 2]; + let mut bb_buf = [0i64; 2]; + + let width = src_row.len(); + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + + let mut rg_sum = rg_initial; + let mut bb_sum = bb_initial; + + let mut coeffs = coeffs_chunk.values; + let end_x = x + coeffs.len(); + + if width - end_x >= 1 { + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + let coeff_i64x2 = i64x2(k[0] as i64, k[1] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let rg0_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum = i64x2_add(rg_sum, 
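+                // wasm32_utils::i64x2_mul_lo is taken here to be a lane-wise 64-bit multiply
+                // that keeps the low 64 bits of each product; with 16-bit components and i32
+                // coefficients the products stay far below 2^63, so nothing is lost.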
wasm32_utils::i64x2_mul_lo(rg0_i64x2, coeff0_i64x2)); + + let rg1_i64x2 = i8x16_swizzle(source, RG1_SHUFFLE); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg1_i64x2, coeff1_i64x2)); + + let bb_i64x2 = i8x16_swizzle(source, BB_SHUFFLE); + bb_sum = i64x2_add(bb_sum, wasm32_utils::i64x2_mul_lo(bb_i64x2, coeff_i64x2)); + x += 2; + } + } + + for &k in coeffs { + let coeff_i64x2 = i64x2_splat(k as i64); + + let &pixel = src_row.get_unchecked(x); + let rg_i64x2 = i64x2(pixel.0[0] as i64, pixel.0[1] as i64); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff_i64x2)); + let bb_i64x2 = i64x2(pixel.0[2] as i64, 0); + bb_sum = i64x2_add(bb_sum, wasm32_utils::i64x2_mul_lo(bb_i64x2, coeff_i64x2)); + + x += 1; + } + + v128_store((&mut rg_buf).as_mut_ptr() as *mut v128, rg_sum); + v128_store((&mut bb_buf).as_mut_ptr() as *mut v128, bb_sum); + let dst_pixel = dst_row.get_unchecked_mut(dst_x); + dst_pixel.0[0] = normalizer.clip(rg_buf[0]); + dst_pixel.0[1] = normalizer.clip(rg_buf[1]); + dst_pixel.0[2] = normalizer.clip(bb_buf[0] + bb_buf[1]); + } +} diff --git a/src/convolution/u16x4/mod.rs b/src/convolution/u16x4/mod.rs index a0d8f4c..d9df251 100644 --- a/src/convolution/u16x4/mod.rs +++ b/src/convolution/u16x4/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U16x4 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U16x4 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u16x4/wasm32.rs b/src/convolution/u16x4/wasm32.rs new file mode 100644 index 0000000..670efb2 --- /dev/null +++ b/src/convolution/u16x4/wasm32.rs @@ -0,0 +1,236 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U16x4; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_one_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( 
+ src_rows: [&[U16x4]; 4], + dst_rows: [&mut &mut [U16x4]; 4], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut rg_buf = [0i64; 2]; + let mut ba_buf = [0i64; 2]; + + /* + |R0 G0 B0 A0 | |R1 G1 B1 A1 | + |0001 0203 0405 0607| |0809 1011 1213 1415| + + Shuffle to extract R0 and G0 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract R1 and G1 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B0 and A0 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B1 and A1 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const RG0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const RG1_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const BA0_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const BA1_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut rg_sum = [i64x2_splat(half_error); 4]; + let mut ba_sum = [i64x2_splat(half_error); 4]; + + let mut coeffs = coeffs_chunk.values; + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + + for i in 0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + let mut sum = rg_sum[i]; + let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2)); + let rg_i64x2 = i8x16_swizzle(source, RG1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff1_i64x2)); + rg_sum[i] = sum; + + let mut sum = ba_sum[i]; + let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2)); + let ba_i64x2 = i8x16_swizzle(source, BA1_SHUFFLE); + sum = i64x2_add(sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff1_i64x2)); + ba_sum[i] = sum; + } + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff0_i64x2 = i64x2_splat(k as i64); + for i in 0..4 { + let source = wasm32_utils::loadl_i64(src_rows[i], x); + let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum[i] = i64x2_add( + rg_sum[i], + wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2), + ); + let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE); + ba_sum[i] = i64x2_add( + ba_sum[i], + wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2), + ); + } + } + + for i in 0..4 { + v128_store((&mut rg_buf).as_mut_ptr() as *mut v128, rg_sum[i]); + v128_store((&mut ba_buf).as_mut_ptr() as *mut v128, ba_sum[i]); + let dst_pixel = dst_rows[i].get_unchecked_mut(dst_x); + dst_pixel.0 = [ + normalizer.clip(rg_buf[0]), + normalizer.clip(rg_buf[1]), + normalizer.clip(ba_buf[0]), + normalizer.clip(ba_buf[1]), + ]; + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn 
horiz_convolution_one_row( + src_row: &[U16x4], + dst_row: &mut [U16x4], + coefficients_chunks: &[optimisations::CoefficientsI32Chunk], + normalizer: &optimisations::Normalizer32, +) { + let precision = normalizer.precision(); + let half_error = 1i64 << (precision - 1); + let mut rg_buf = [0i64; 2]; + let mut ba_buf = [0i64; 2]; + + /* + |R0 G0 B0 A0 | |R1 G1 B1 A1 | + |0001 0203 0405 0607| |0809 1011 1213 1415| + + Shuffle to extract R0 and G0 as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract R1 and G1 as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B0 and A0 as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1 + + Shuffle to extract B1 and A1 as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + */ + + const RG0_SHUFFLE: v128 = i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1); + const RG1_SHUFFLE: v128 = i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1); + const BA0_SHUFFLE: v128 = i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1); + const BA1_SHUFFLE: v128 = i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut coeffs = coeffs_chunk.values; + let mut rg_sum = i64x2_splat(half_error); + let mut ba_sum = i64x2_splat(half_error); + + let coeffs_by_2 = coeffs.chunks_exact(2); + coeffs = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let coeff0_i64x2 = i64x2_splat(k[0] as i64); + let coeff1_i64x2 = i64x2_splat(k[1] as i64); + + let source = wasm32_utils::load_v128(src_row, x); + + let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2)); + let rg_i64x2 = i8x16_swizzle(source, RG1_SHUFFLE); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff1_i64x2)); + + let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE); + ba_sum = i64x2_add(ba_sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2)); + let ba_i64x2 = i8x16_swizzle(source, BA1_SHUFFLE); + ba_sum = i64x2_add(ba_sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff1_i64x2)); + + x += 2; + } + + if let Some(&k) = coeffs.first() { + let coeff0_i64x2 = i64x2_splat(k as i64); + let source = wasm32_utils::loadl_i64(src_row, x); + let rg_i64x2 = i8x16_swizzle(source, RG0_SHUFFLE); + rg_sum = i64x2_add(rg_sum, wasm32_utils::i64x2_mul_lo(rg_i64x2, coeff0_i64x2)); + let ba_i64x2 = i8x16_swizzle(source, BA0_SHUFFLE); + ba_sum = i64x2_add(ba_sum, wasm32_utils::i64x2_mul_lo(ba_i64x2, coeff0_i64x2)); + } + + v128_store((&mut rg_buf).as_mut_ptr() as *mut v128, rg_sum); + v128_store((&mut ba_buf).as_mut_ptr() as *mut v128, ba_sum); + let dst_pixel = dst_row.get_unchecked_mut(dst_x); + dst_pixel.0 = [ + normalizer.clip(rg_buf[0]), + normalizer.clip(rg_buf[1]), + normalizer.clip(ba_buf[0]), + normalizer.clip(ba_buf[1]), + ]; + } +} diff --git a/src/convolution/u8x1/mod.rs b/src/convolution/u8x1/mod.rs index 45141cb..5697e49 100644 --- a/src/convolution/u8x1/mod.rs +++ b/src/convolution/u8x1/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U8 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U8 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] 
CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u8x1/wasm32.rs b/src/convolution/u8x1/wasm32.rs new file mode 100644 index 0000000..517345f --- /dev/null +++ b/src/convolution/u8x1/wasm32.rs @@ -0,0 +1,163 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U8; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( + src_rows: [&[U8]; 4], + dst_rows: [&mut &mut [U8]; 4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + normalizer: &optimisations::Normalizer16, +) { + const ZERO: v128 = i64x2(0, 0); + let initial = 1 << (normalizer.precision() - 1); + let mut buf = [0, 0, 0, 0, initial]; + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let coeffs = coeffs_chunk.values; + let mut x = coeffs_chunk.start as usize; + let mut result_i32x4 = [ZERO, ZERO, ZERO, ZERO]; + + let coeffs_by_8 = coeffs.chunks_exact(8); + let reminder8 = coeffs_by_8.remainder(); + for k in coeffs_by_8 { + let coeffs_i16x8 = v128_load(k.as_ptr() as *const v128); + for i in 0..4 { + let pixels_u8x8 = wasm32_utils::loadl_i64(src_rows[i], x); + let pixels_i16x8 = u16x8_extend_low_u8x16(pixels_u8x8); + result_i32x4[i] = + i32x4_add(result_i32x4[i], i32x4_dot_i16x8(pixels_i16x8, coeffs_i16x8)); + } + x += 8; + } + + let mut coeffs_by_4 = reminder8.chunks_exact(4); + let reminder4 = coeffs_by_4.remainder(); + if let Some(k) = coeffs_by_4.next() { + let coeffs_i16x4 = wasm32_utils::loadl_i64(k, 0); + for i in 0..4 { + let pixels_u8x4 = wasm32_utils::loadl_i32(src_rows[i], x); + let pixels_i16x4 = u16x8_extend_low_u8x16(pixels_u8x4); + result_i32x4[i] = + i32x4_add(result_i32x4[i], i32x4_dot_i16x8(pixels_i16x4, coeffs_i16x4)); + } + x += 4; + } + + let mut result_i32x4 = result_i32x4.map(|v| { + v128_store(buf.as_mut_ptr() as *mut v128, v); + buf.iter().sum() + }); + + for &coeff in reminder4 { + let coeff_i32 = coeff as i32; + for i in 0..4 { + result_i32x4[i] += 
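+                // Scalar tail: fewer than four coefficients are left, and the SIMD lane sums
+                // were already collapsed above via `buf` (whose fifth element carries the
+                // rounding constant `initial`), so plain i32 arithmetic finishes the pixel.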
src_rows[i].get_unchecked(x).0.to_owned() as i32 * coeff_i32; + } + x += 1; + } + + let result_u8x4 = result_i32x4.map(|v| normalizer.clip(v)); + for i in 0..4 { + dst_rows[i].get_unchecked_mut(dst_x).0 = result_u8x4[i]; + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_row( + src_row: &[U8], + dst_row: &mut [U8], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + normalizer: &optimisations::Normalizer16, +) { + const ZERO: v128 = i64x2(0, 0); + let initial = 1 << (normalizer.precision() - 1); + let mut buf = [0, 0, 0, 0, initial]; + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let coeffs = coeffs_chunk.values; + let mut x = coeffs_chunk.start as usize; + let mut result_i32x4 = ZERO; + + let coeffs_by_8 = coeffs.chunks_exact(8); + let reminder8 = coeffs_by_8.remainder(); + for k in coeffs_by_8 { + let coeffs_i16x8 = v128_load(k.as_ptr() as *const v128); + let pixels_u8x8 = wasm32_utils::loadl_i64(src_row, x); + let pixels_i16x8 = u16x8_extend_low_u8x16(pixels_u8x8); + result_i32x4 = i32x4_add(result_i32x4, i32x4_dot_i16x8(pixels_i16x8, coeffs_i16x8)); + x += 8; + } + + let mut coeffs_by_4 = reminder8.chunks_exact(4); + let reminder4 = coeffs_by_4.remainder(); + if let Some(k) = coeffs_by_4.next() { + let coeffs_i16x4 = wasm32_utils::loadl_i64(k, 0); + let pixels_u8x4 = wasm32_utils::loadl_i32(src_row, x); + let pixels_i16x4 = u16x8_extend_low_u8x16(pixels_u8x4); + result_i32x4 = i32x4_add(result_i32x4, i32x4_dot_i16x8(pixels_i16x4, coeffs_i16x4)); + x += 4; + } + + v128_store(buf.as_mut_ptr() as *mut v128, result_i32x4); + let mut result_i32 = buf.iter().sum(); + + for &coeff in reminder4 { + let coeff_i32 = coeff as i32; + result_i32 += src_row.get_unchecked(x).0 as i32 * coeff_i32; + x += 1; + } + + dst_row.get_unchecked_mut(dst_x).0 = normalizer.clip(result_i32); + } +} diff --git a/src/convolution/u8x2/mod.rs b/src/convolution/u8x2/mod.rs index 29c99fc..2197b97 100644 --- a/src/convolution/u8x2/mod.rs +++ b/src/convolution/u8x2/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U8x2 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U8x2 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u8x2/wasm32.rs b/src/convolution/u8x2/wasm32.rs new file mode 100644 index 0000000..fec5f0b --- /dev/null +++ b/src/convolution/u8x2/wasm32.rs @@ -0,0 +1,320 @@ +use std::arch::wasm32::*; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U8x2; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let 
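+    // normalized_chunks() presumably yields, per destination pixel, the starting source x
+    // plus the i16 fixed-point weights; precision() and clip() used below undo that scaling.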
coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_four_rows(src_rows, dst_rows, &coefficients_chunks, &normalizer); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_one_row( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + &normalizer, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_four_rows( + src_rows: [&[U8x2]; 4], + dst_rows: [&mut &mut [U8x2]; 4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + normalizer: &optimisations::Normalizer16, +) { + let precision = normalizer.precision(); + let initial = i32x4_splat(1 << (precision - 2)); + + /* + |L A | |L A | |L A | |L A | |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Shuffle components with converting from u8 into i16: + + A: |-1 07| |-1 05| |-1 03| |-1 01| + L: |-1 06| |-1 04| |-1 02| |-1 00| + */ + #[rustfmt::skip] + const SH1: v128 = i8x16( + 0, -1, 2, -1, 4, -1, 6, -1, 1, -1, 3, -1, 5, -1, 7, -1 + ); + /* + A: |-1 15| |-1 13| |-1 11| |-1 09| + L: |-1 14| |-1 12| |-1 10| |-1 08| + */ + #[rustfmt::skip] + const SH2: v128 = i8x16( + 8, -1, 10, -1, 12, -1, 14, -1, 9, -1, 11, -1, 13, -1, 15, -1 + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x = coeffs_chunk.start as usize; + + let mut sss: [v128; 4] = [initial; 4]; + let coeffs = coeffs_chunk.values; + + let coeffs_by_8 = coeffs.chunks_exact(8); + let reminder = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let mmk0 = wasm32_utils::ptr_i16_to_set1_i64(k, 0); + let mmk1 = wasm32_utils::ptr_i16_to_set1_i64(k, 4); + + for i in 0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + let pix = i8x16_swizzle(source, SH1); + let tmp_sum = i32x4_add(sss[i], i32x4_dot_i16x8(pix, mmk0)); + let pix = i8x16_swizzle(source, SH2); + sss[i] = i32x4_add(tmp_sum, i32x4_dot_i16x8(pix, mmk1)); + } + x += 8; + } + + let coeffs_by_4 = reminder.chunks_exact(4); + let reminder = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let mmk = wasm32_utils::ptr_i16_to_set1_i64(k, 0); + + for i in 0..4 { + let source = wasm32_utils::loadl_i64(src_rows[i], x); + let pix = i8x16_swizzle(source, SH1); + sss[i] = i32x4_add(sss[i], i32x4_dot_i16x8(pix, mmk)); + } + x += 4; + } + + let coeffs_by_2 = reminder.chunks_exact(2); + let reminder = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + + for i in 0..4 { + let source = wasm32_utils::loadl_i32(src_rows[i], x); + let pix = i8x16_swizzle(source, SH1); + sss[i] = i32x4_add(sss[i], i32x4_dot_i16x8(pix, mmk)); + } + x += 2; + } + + if let Some(&k) = reminder.first() { + let mmk = i32x4_splat(k as i32); + + for i in 0..4 { + let source = wasm32_utils::loadl_i16(src_rows[i], x); + let pix = i8x16_swizzle(source, 
SH1); + sss[i] = i32x4_add(sss[i], i32x4_dot_i16x8(pix, mmk)); + } + } + + for i in 0..4 { + set_dst_pixel(sss[i], dst_rows[i], dst_x, normalizer); + } + } +} + +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn set_dst_pixel( + raw: v128, + d_row: &mut &mut [U8x2], + dst_x: usize, + normalizer: &optimisations::Normalizer16, +) { + let l32x2 = i64x2_extract_lane::<0>(raw); + let a32x2 = i64x2_extract_lane::<1>(raw); + let l32 = ((l32x2 >> 32) as i32).saturating_add((l32x2 & 0xffffffff) as i32); + let a32 = ((a32x2 >> 32) as i32).saturating_add((a32x2 & 0xffffffff) as i32); + let l8 = normalizer.clip(l32); + let a8 = normalizer.clip(a32); + d_row.get_unchecked_mut(dst_x).0 = u16::from_le_bytes([l8, a8]); +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_one_row( + src_row: &[U8x2], + dst_row: &mut [U8x2], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + normalizer: &optimisations::Normalizer16, +) { + let precision = normalizer.precision(); + /* + |L A | |L A | |L A | |L A | |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Scale first four pixels into i16: + + A: |-1 07| |-1 05| + L: |-1 06| |-1 04| + A: |-1 03| |-1 01| + L: |-1 02| |-1 00| + */ + #[rustfmt::skip] + const PIX_SH1: v128 = i8x16( + 0, -1, 2, -1, 1, -1, 3, -1, 4, -1, 6, -1, 5, -1, 7, -1 + ); + /* + |C0 | |C1 | |C2 | |C3 | |C4 | |C5 | |C6 | |C7 | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Duplicate first four coefficients for A and L components of pixels: + + CA: |07 06| |05 04| + CL: |07 06| |05 04| + CA: |03 02| |01 00| + CL: |03 02| |01 00| + */ + #[rustfmt::skip] + const COEFF_SH1: v128 = i8x16( + 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7 + ); + + /* + |L A | |L A | |L A | |L A | |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Scale second four pixels into i16: + + A: |-1 15| |-1 13| + L: |-1 14| |-1 12| + A: |-1 11| |-1 09| + L: |-1 10| |-1 08| + */ + #[rustfmt::skip] + const PIX_SH2: v128 = i8x16( + 8, -1, 10, -1, 9, -1, 11, -1, 12, -1, 14, -1, 13, -1, 15, -1 + ); + /* + |C0 | |C1 | |C2 | |C3 | |C4 | |C5 | |C6 | |C7 | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Duplicate second four coefficients for A and L components of pixels: + + CA: |15 14| |13 12| + CL: |15 14| |13 12| + CA: |11 10| |09 08| + CL: |11 10| |09 08| + */ + #[rustfmt::skip] + const COEFF_SH2: v128 = i8x16( + 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15, 12, 13, 14, 15 + ); + + /* + |L A | |L A | |L A | |L A | + |00 01| |02 03| |04 05| |06 07| |08 09| |10 11| |12 13| |14 15| + + Scale four pixels into i16: + + A: |-1 07| |-1 05| + L: |-1 06| |-1 04| + A: |-1 03| |-1 01| + L: |-1 02| |-1 00| + */ + const PIX_SH3: v128 = i8x16(0, -1, 2, -1, 1, -1, 3, -1, 4, -1, 6, -1, 5, -1, 7, -1); + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x = coeffs_chunk.start as usize; + let mut coeffs = coeffs_chunk.values; + + // Lower part will be added to higher, use only half of the error + let mut sss = i32x4_splat(1 << (precision - 2)); + + let coeffs_by_8 = coeffs.chunks_exact(8); + coeffs = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let ksource = 
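+            // All eight i16 coefficients are loaded at once; COEFF_SH1/COEFF_SH2 then repeat
+            // each coefficient pair (k[i], k[i+1]) so one i32x4_dot_i16x8 sums two pixels'
+            // L components in one lane and their A components in the next.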
wasm32_utils::load_v128(k, 0); + let source = wasm32_utils::load_v128(src_row, x); + + let pix = i8x16_swizzle(source, PIX_SH1); + let mmk = i8x16_swizzle(ksource, COEFF_SH1); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + let pix = i8x16_swizzle(source, PIX_SH2); + let mmk = i8x16_swizzle(ksource, COEFF_SH2); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 8; + } + + let coeffs_by_4 = coeffs.chunks_exact(4); + let reminder1 = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let mmk = i16x8(k[0], k[1], k[0], k[1], k[2], k[3], k[2], k[3]); + let source = wasm32_utils::loadl_i64(src_row, x); + let pix = i8x16_swizzle(source, PIX_SH3); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 4 + } + + if !reminder1.is_empty() { + let mut pixels: [i16; 6] = [0; 6]; + let mut coeffs: [i16; 3] = [0; 3]; + for (i, &coeff) in reminder1.iter().enumerate() { + coeffs[i] = coeff; + let pixel: [u8; 2] = (*src_row.get_unchecked(x)).0.to_le_bytes(); + pixels[i * 2] = pixel[0] as i16; + pixels[i * 2 + 1] = pixel[1] as i16; + x += 1; + } + + let pix = i16x8( + pixels[0], pixels[2], pixels[1], pixels[3], pixels[4], 0, pixels[5], 0, + ); + let mmk = i16x8( + coeffs[0], coeffs[1], coeffs[0], coeffs[1], coeffs[2], 0, coeffs[2], 0, + ); + + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + } + + let lo = i64x2_extract_lane::<0>(sss); + let hi = i64x2_extract_lane::<1>(sss); + + let a32 = ((lo >> 32) as i32).saturating_add((hi >> 32) as i32); + let l32 = ((lo & 0xffffffff) as i32).saturating_add((hi & 0xffffffff) as i32); + let a8 = normalizer.clip(a32); + let l8 = normalizer.clip(l32); + dst_row.get_unchecked_mut(dst_x).0 = u16::from_le_bytes([l8, a8]); + } +} diff --git a/src/convolution/u8x3/mod.rs b/src/convolution/u8x3/mod.rs index 01160de..249b6e3 100644 --- a/src/convolution/u8x3/mod.rs +++ b/src/convolution/u8x3/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U8x3 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U8x3 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u8x3/wasm32.rs b/src/convolution/u8x3/wasm32.rs new file mode 100644 index 0000000..9d0e08d --- /dev/null +++ b/src/convolution/u8x3/wasm32.rs @@ -0,0 +1,292 @@ +use std::arch::wasm32::*; +use std::intrinsics::transmute; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U8x3; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let precision = normalizer.precision(); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_8u4x(src_rows, dst_rows, &coefficients_chunks, precision); + } + } + + let mut yy = dst_height 
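+    // dst_height rounded down to a multiple of four: those rows were produced by the
+    // four-rows-at-a-time loop above; the remaining 0..=3 rows go through the one-row path.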
- dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_8u( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + precision, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u4x( + src_rows: [&[U8x3]; 4], + dst_rows: [&mut &mut [U8x3]; 4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + precision: u8, +) { + const ZERO: v128 = i64x2(0, 0); + let initial = i32x4_splat(1 << (precision - 1)); + let src_width = src_rows[0].len(); + + /* + |R G B | |R G B | |R G B | |R G B | |R G B | |R | + |00 01 02| |03 04 05| |06 07 08| |09 10 11| |12 13 14| |15| + + Ignore 12-15 bytes in register and + shuffle other components with converting from u8 into i16: + + x: |-1 -1| |-1 -1| + B: |-1 05| |-1 02| + G: |-1 04| |-1 01| + R: |-1 03| |-1 00| + */ + #[rustfmt::skip] + const SH_LO: v128 = i8x16( + 0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1 + ); + /* + x: |-1 -1| |-1 -1| + B: |-1 11| |-1 08| + G: |-1 10| |-1 07| + R: |-1 09| |-1 06| + */ + #[rustfmt::skip] + const SH_HI: v128 = i8x16( + 6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1 + ); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let x_start = coeffs_chunk.start as usize; + let mut x = x_start; + + let mut sss_a = [initial; 4]; + let mut coeffs = coeffs_chunk.values; + + // Next block of code will be load source pixels by 16 bytes per time. + // We must guarantee what this process will not go beyond + // the one row of image. + // (16 bytes) / (3 bytes per pixel) = 5 whole pixels + 1 byte + let max_x = src_width.saturating_sub(5); + if x < max_x { + let coeffs_by_4 = coeffs.chunks_exact(4); + + for k in coeffs_by_4 { + let mmk0 = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + let mmk1 = wasm32_utils::ptr_i16_to_set1_i32(k, 2); + for i in 0..4 { + let source = wasm32_utils::load_v128(src_rows[i], x); + let pix = i8x16_swizzle(source, SH_LO); + let mut sss = sss_a[i]; + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk0)); + let pix = i8x16_swizzle(source, SH_HI); + sss_a[i] = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk1)); + } + + x += 4; + if x >= max_x { + break; + } + } + } + + // Next block of code will be load source pixels by 8 bytes per time. + // We must guarantee what this process will not go beyond + // the one row of image. + // (8 bytes) / (3 bytes per pixel) = 2 whole pixels + 2 bytes + let max_x = src_width.saturating_sub(2); + if x < max_x { + let coeffs_by_2 = coeffs[x - x_start..].chunks_exact(2); + + for k in coeffs_by_2 { + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + + for i in 0..4 { + let source = wasm32_utils::loadl_i64(src_rows[i], x); + let pix = i8x16_swizzle(source, SH_LO); + sss_a[i] = i32x4_add(sss_a[i], i32x4_dot_i16x8(pix, mmk)); + } + + x += 2; + if x >= max_x { + break; + } + } + } + + coeffs = coeffs.split_at(x - x_start).1; + for &k in coeffs { + let mmk = i32x4_splat(k as i32); + for i in 0..4 { + let pix = wasm32_utils::i32x4_extend_low_ptr_u8x3(src_rows[i], x); + sss_a[i] = i32x4_add(sss_a[i], i32x4_dot_i16x8(pix, mmk)); + } + + x += 1; + } + macro_rules! 
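+        // `call!` only wraps the right shift by `precision`; constify_imm8! (a helper macro
+        // defined elsewhere in the crate) is presumably kept to mirror the sse4 path, where
+        // the shift amount has to be a compile-time immediate.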
call { + ($imm8:expr) => {{ + sss_a[0] = i32x4_shr(sss_a[0], $imm8); + sss_a[1] = i32x4_shr(sss_a[1], $imm8); + sss_a[2] = i32x4_shr(sss_a[2], $imm8); + sss_a[3] = i32x4_shr(sss_a[3], $imm8); + }}; + } + constify_imm8!(precision, call); + + for i in 0..4 { + let sss = i16x8_narrow_i32x4(sss_a[i], ZERO); + let pixel: u32 = transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss, ZERO))); + let bytes = pixel.to_le_bytes(); + dst_rows[i].get_unchecked_mut(dst_x).0 = [bytes[0], bytes[1], bytes[2]]; + } + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coeffs.len() == dst_rows.0.len() * window_size +/// - max(bound.start + bound.size for bound in bounds) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[inline] +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u( + src_row: &[U8x3], + dst_row: &mut [U8x3], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + precision: u8, +) { + #[rustfmt::skip] + const PIX_SH1: v128 = i8x16( + 0, -1, 3, -1, 1, -1, 4, -1, 2, -1, 5, -1, -1, -1, -1, -1 + ); + #[rustfmt::skip] + const COEF_SH1: v128 = i8x16( + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 + ); + #[rustfmt::skip] + const PIX_SH2: v128 = i8x16( + 6, -1, 9, -1, 7, -1, 10, -1, 8, -1, 11, -1, -1, -1, -1, -1 + ); + #[rustfmt::skip] + const COEF_SH2: v128 = i8x16( + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7 + ); + /* + Load 8 bytes from memory into low half of 16-bytes register: + |R G B | |R G B | |R G | + |00 01 02| |03 04 05| |06 07| 08 09 10 11 12 13 14 15 + + Ignore 06-16 bytes in 16-bytes register and + shuffle other components with converting from u8 into i16: + + x: |-1 -1| |-1 -1| + B: |-1 05| |-1 02| + G: |-1 04| |-1 01| + R: |-1 03| |-1 00| + */ + let src_width = src_row.len(); + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let x_start = coeffs_chunk.start as usize; + let mut x = x_start; + let mut coeffs = coeffs_chunk.values; + let mut sss = i32x4_splat(1 << (precision - 1)); + + // Next block of code will be load source pixels by 16 bytes per time. + // We must guarantee what this process will not go beyond + // the one row of image. + // (16 bytes) / (3 bytes per pixel) = 5 whole pixels + 1 bytes + let max_x = src_width.saturating_sub(5); + if x < max_x { + let coeffs_by_4 = coeffs.chunks_exact(4); + for k in coeffs_by_4 { + let ksource = wasm32_utils::loadl_i64(k, 0); + let source = wasm32_utils::load_v128(src_row, x); + + let pix = i8x16_swizzle(source, PIX_SH1); + let mmk = i8x16_swizzle(ksource, COEF_SH1); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + let pix = i8x16_swizzle(source, PIX_SH2); + let mmk = i8x16_swizzle(ksource, COEF_SH2); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 4; + if x >= max_x { + break; + } + } + } + + // Next block of code will be load source pixels by 8 bytes per time. + // We must guarantee what this process will not go beyond + // the one row of image. 
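+        // For example, with src_width = 10 the guard below gives max_x = 8, so the last
+        // 8-byte load starts at byte 3 * 7 = 21 and reads bytes 21..29 of the 30-byte row.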
+ // (8 bytes) / (3 bytes per pixel) = 2 whole pixels + 2 bytes + let max_x = src_width.saturating_sub(2); + if x < max_x { + let coeffs_by_2 = coeffs[x - x_start..].chunks_exact(2); + + for k in coeffs_by_2 { + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + let source = wasm32_utils::loadl_i64(src_row, x); + let pix = i8x16_swizzle(source, PIX_SH1); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 2; + if x >= max_x { + break; + } + } + } + + coeffs = coeffs.split_at(x - x_start).1; + for &k in coeffs { + let pix = wasm32_utils::i32x4_extend_low_ptr_u8x3(src_row, x); + let mmk = i32x4_splat(k as i32); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + x += 1; + } + + macro_rules! call { + ($imm8:expr) => {{ + sss = i32x4_shr(sss, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss = i16x8_narrow_i32x4(sss, sss); + let pixel: u32 = transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss, sss))); + let bytes = pixel.to_le_bytes(); + dst_row.get_unchecked_mut(dst_x).0 = [bytes[0], bytes[1], bytes[2]]; + } +} diff --git a/src/convolution/u8x4/mod.rs b/src/convolution/u8x4/mod.rs index ae37998..69a9dc3 100644 --- a/src/convolution/u8x4/mod.rs +++ b/src/convolution/u8x4/mod.rs @@ -12,6 +12,8 @@ mod native; mod neon; #[cfg(target_arch = "x86_64")] mod sse4; +#[cfg(target_arch = "wasm32")] +mod wasm32; impl Convolution for U8x4 { fn horiz_convolution( @@ -28,6 +30,10 @@ impl Convolution for U8x4 { CpuExtensions::Sse4_1 => sse4::horiz_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::horiz_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => { + wasm32::horiz_convolution(src_image, dst_image, offset, coeffs) + } _ => native::horiz_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/u8x4/wasm32.rs b/src/convolution/u8x4/wasm32.rs new file mode 100644 index 0000000..4651b8b --- /dev/null +++ b/src/convolution/u8x4/wasm32.rs @@ -0,0 +1,282 @@ +use std::arch::wasm32::*; +use std::intrinsics::transmute; + +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::U8x4; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +// This code is based on C-implementation from Pillow-SIMD package for Python +// https://github.com/uploadcare/pillow-simd + +#[inline] +pub(crate) fn horiz_convolution( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let precision = normalizer.precision(); + let coefficients_chunks = normalizer.normalized_chunks(); + let dst_height = dst_image.height().get(); + + let src_iter = src_image.iter_4_rows(offset, dst_height + offset); + let dst_iter = dst_image.iter_4_rows_mut(); + for (src_rows, dst_rows) in src_iter.zip(dst_iter) { + unsafe { + horiz_convolution_8u4x(src_rows, dst_rows, &coefficients_chunks, precision); + } + } + + let mut yy = dst_height - dst_height % 4; + while yy < dst_height { + unsafe { + horiz_convolution_8u( + src_image.get_row(yy + offset).unwrap(), + dst_image.get_row_mut(yy).unwrap(), + &coefficients_chunks, + precision, + ); + } + yy += 1; + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - length of all rows in src_rows must be equal +/// - length of all rows in dst_rows must be equal +/// - coefficients_chunks.len() == dst_rows.0.len() +/// - max(chunk.start + chunk.values.len() for chunk in 
coefficients_chunks) <= src_row.0.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u4x( + src_rows: [&[U8x4]; 4], + dst_rows: [&mut &mut [U8x4]; 4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + precision: u8, +) { + let initial = i32x4_splat(1 << (precision - 1)); + const MASK_LO: v128 = i8x16(0, -1, 4, -1, 1, -1, 5, -1, 2, -1, 6, -1, 3, -1, 7, -1); + const MASK_HI: v128 = i8x16(8, -1, 12, -1, 9, -1, 13, -1, 10, -1, 14, -1, 11, -1, 15, -1); + const MASK: v128 = i8x16(0, -1, 4, -1, 1, -1, 5, -1, 2, -1, 6, -1, 3, -1, 7, -1); + + for (dst_x, coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + + let mut sss0 = initial; + let mut sss1 = initial; + let mut sss2 = initial; + let mut sss3 = initial; + + let coeffs = coeffs_chunk.values; + let coeffs_by_4 = coeffs.chunks_exact(4); + let reminder1 = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let mmk_lo = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + let mmk_hi = wasm32_utils::ptr_i16_to_set1_i32(k, 2); + + // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 + let mut source = wasm32_utils::load_v128(src_rows[0], x); + // [16] a1 a0 b1 b0 g1 g0 r1 r0 + let mut pix = i8x16_swizzle(source, MASK_LO); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk_lo)); + // [16] a3 a2 b3 b2 g3 g2 r3 r2 + pix = i8x16_swizzle(source, MASK_HI); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk_hi)); + + source = wasm32_utils::load_v128(src_rows[1], x); + pix = i8x16_swizzle(source, MASK_LO); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk_lo)); + pix = i8x16_swizzle(source, MASK_HI); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk_hi)); + + source = wasm32_utils::load_v128(src_rows[2], x); + pix = i8x16_swizzle(source, MASK_LO); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk_lo)); + pix = i8x16_swizzle(source, MASK_HI); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk_hi)); + + source = wasm32_utils::load_v128(src_rows[3], x); + pix = i8x16_swizzle(source, MASK_LO); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk_lo)); + pix = i8x16_swizzle(source, MASK_HI); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk_hi)); + x += 4; + } + + let coeffs_by_2 = reminder1.chunks_exact(2); + let reminder2 = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + // [16] k1 k0 k1 k0 k1 k0 k1 k0 + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + + // [8] x x x x x x x x a1 b1 g1 r1 a0 b0 g0 r0 + let mut pix = wasm32_utils::loadl_i64(src_rows[0], x); + // [16] a1 a0 b1 b0 g1 g0 r1 r0 + pix = i8x16_swizzle(pix, MASK); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::loadl_i64(src_rows[1], x); + pix = i8x16_swizzle(pix, MASK); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::loadl_i64(src_rows[2], x); + pix = i8x16_swizzle(pix, MASK); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::loadl_i64(src_rows[3], x); + pix = i8x16_swizzle(pix, MASK); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk)); + + x += 2; + } + + if let Some(&k) = reminder2.first() { + // [16] xx k0 xx k0 xx k0 xx k0 + let mmk = i32x4_splat(k as i32); + // [16] xx a0 xx b0 xx g0 xx r0 + let mut pix = wasm32_utils::i32x4_extend_low_ptr_u8x4(src_rows[0], x); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::i32x4_extend_low_ptr_u8x4(src_rows[1], x); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + pix = 
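+            // i32x4_extend_low_ptr_u8x4 is assumed to load one U8x4 pixel and zero-extend its
+            // four bytes into i32 lanes, matching the "[16] xx a0 xx b0 xx g0 xx r0" layout
+            // noted above, so the same dot-product path works for a lone coefficient.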
wasm32_utils::i32x4_extend_low_ptr_u8x4(src_rows[2], x); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk)); + + pix = wasm32_utils::i32x4_extend_low_ptr_u8x4(src_rows[3], x); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss0 = i32x4_shr(sss0, $imm8); + sss1 = i32x4_shr(sss1, $imm8); + sss2 = i32x4_shr(sss2, $imm8); + sss3 = i32x4_shr(sss3, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss0 = i16x8_narrow_i32x4(sss0, sss0); + sss1 = i16x8_narrow_i32x4(sss1, sss1); + sss2 = i16x8_narrow_i32x4(sss2, sss2); + sss3 = i16x8_narrow_i32x4(sss3, sss3); + *dst_rows[0].get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss0, sss0))); + *dst_rows[1].get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss1, sss1))); + *dst_rows[2].get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss2, sss2))); + *dst_rows[3].get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss3, sss3))); + } +} + +/// For safety, it is necessary to ensure the following conditions: +/// - bounds.len() == dst_row.len() +/// - coefficients_chunks.len() == dst_row.len() +/// - max(chunk.start + chunk.values.len() for chunk in coefficients_chunks) <= src_row.len() +/// - precision <= MAX_COEFS_PRECISION +#[target_feature(enable = "simd128")] +unsafe fn horiz_convolution_8u( + src_row: &[U8x4], + dst_row: &mut [U8x4], + coefficients_chunks: &[optimisations::CoefficientsI16Chunk], + precision: u8, +) { + let initial = i32x4_splat(1 << (precision - 1)); + const SH1: v128 = i8x16(0, -1, 8, -1, 1, -1, 9, -1, 2, -1, 10, -1, 3, -1, 11, -1); + const SH2: v128 = i8x16(0, 1, 4, 5, 0, 1, 4, 5, 0, 1, 4, 5, 0, 1, 4, 5); + const SH3: v128 = i8x16(4, -1, 12, -1, 5, -1, 13, -1, 6, -1, 14, -1, 7, -1, 15, -1); + const SH4: v128 = i8x16(2, 3, 6, 7, 2, 3, 6, 7, 2, 3, 6, 7, 2, 3, 6, 7); + const SH5: v128 = i8x16(8, 9, 12, 13, 8, 9, 12, 13, 8, 9, 12, 13, 8, 9, 12, 13); + const SH6: v128 = i8x16( + 10, 11, 14, 15, 10, 11, 14, 15, 10, 11, 14, 15, 10, 11, 14, 15, + ); + const SH7: v128 = i8x16(0, -1, 4, -1, 1, -1, 5, -1, 2, -1, 6, -1, 3, -1, 7, -1); + + for (dst_x, &coeffs_chunk) in coefficients_chunks.iter().enumerate() { + let mut x: usize = coeffs_chunk.start as usize; + let mut sss = initial; + + let coeffs_by_8 = coeffs_chunk.values.chunks_exact(8); + let reminder8 = coeffs_by_8.remainder(); + + for k in coeffs_by_8 { + let ksource = wasm32_utils::load_v128(k, 0); + + let mut source = wasm32_utils::load_v128(src_row, x); + + let mut pix = i8x16_swizzle(source, SH1); + let mut mmk = i8x16_swizzle(ksource, SH2); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + pix = i8x16_swizzle(source, SH3); + mmk = i8x16_swizzle(ksource, SH4); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + source = wasm32_utils::load_v128(src_row, x + 4); + + pix = i8x16_swizzle(source, SH1); + mmk = i8x16_swizzle(ksource, SH5); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + pix = i8x16_swizzle(source, SH3); + mmk = i8x16_swizzle(ksource, SH6); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 8; + } + + let coeffs_by_4 = reminder8.chunks_exact(4); + let reminder4 = coeffs_by_4.remainder(); + + for k in coeffs_by_4 { + let source = wasm32_utils::load_v128(src_row, x); + let ksource = wasm32_utils::loadl_i64(k, 0); + + let mut pix = i8x16_swizzle(source, SH1); + let mut mmk = i8x16_swizzle(ksource, SH2); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + pix = 
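+            // SH1/SH2 paired pixels (x, x+2) with coefficients k[0], k[2] above; SH3/SH4 do
+            // the same for pixels (x+1, x+3) with k[1], k[3], so two dot products cover the
+            // whole four-pixel block per channel.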
i8x16_swizzle(source, SH3); + mmk = i8x16_swizzle(ksource, SH4); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 4; + } + + let coeffs_by_2 = reminder4.chunks_exact(2); + let reminder2 = coeffs_by_2.remainder(); + + for k in coeffs_by_2 { + let mmk = wasm32_utils::ptr_i16_to_set1_i32(k, 0); + let source = wasm32_utils::loadl_i64(src_row, x); + let pix = i8x16_swizzle(source, SH7); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + x += 2 + } + + if let Some(&k) = reminder2.first() { + let pix = wasm32_utils::i32x4_extend_low_ptr_u8x4(src_row, x); + let mmk = i32x4_splat(k as i32); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss = i32x4_shr(sss, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss = i16x8_narrow_i32x4(sss, sss); + *dst_row.get_unchecked_mut(dst_x) = + transmute(i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss, sss))); + } +} diff --git a/src/convolution/vertical_u16/mod.rs b/src/convolution/vertical_u16/mod.rs index ec581d6..7064a1c 100644 --- a/src/convolution/vertical_u16/mod.rs +++ b/src/convolution/vertical_u16/mod.rs @@ -10,6 +10,8 @@ pub(crate) mod native; mod neon; #[cfg(target_arch = "x86_64")] pub(crate) mod sse4; +#[cfg(target_arch = "wasm32")] +pub(crate) mod wasm32; pub(crate) fn vert_convolution_u16>( src_image: &ImageView, @@ -29,6 +31,8 @@ pub(crate) fn vert_convolution_u16>( CpuExtensions::Sse4_1 => sse4::vert_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::vert_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => wasm32::vert_convolution(src_image, dst_image, offset, coeffs), _ => native::vert_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/vertical_u16/wasm32.rs b/src/convolution/vertical_u16/wasm32.rs new file mode 100644 index 0000000..4f3c457 --- /dev/null +++ b/src/convolution/vertical_u16/wasm32.rs @@ -0,0 +1,238 @@ +use std::arch::wasm32::*; + +use crate::convolution::optimisations::CoefficientsI32Chunk; +use crate::convolution::vertical_u16::native::convolution_by_u16; +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::PixelExt; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +pub(crate) fn vert_convolution>( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer32::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let src_x = offset as usize * T::count_of_components(); + + let dst_rows = dst_image.iter_rows_mut(); + for (dst_row, coeffs_chunk) in dst_rows.zip(coefficients_chunks) { + unsafe { + vert_convolution_into_one_row_u16(src_image, dst_row, src_x, coeffs_chunk, &normalizer); + } + } +} + +#[target_feature(enable = "simd128")] +unsafe fn vert_convolution_into_one_row_u16>( + src_img: &ImageView, + dst_row: &mut [T], + mut src_x: usize, + coeffs_chunk: CoefficientsI32Chunk, + normalizer: &optimisations::Normalizer32, +) { + let y_start = coeffs_chunk.start; + let coeffs = coeffs_chunk.values; + let max_y = y_start + coeffs.len() as u32; + let mut dst_u16 = T::components_mut(dst_row); + + /* + |0 1 2 3 4 5 6 7 | + |0001 0203 0405 0607 0809 1011 1213 1415| + + Shuffle to extract 0-1 components as i64: + 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1 + + Shuffle to extract 2-3 components as i64: + 4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, 
-1, -1, -1 + + Shuffle to extract 4-5 components as i64: + 8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1 + + Shuffle to extract 6-7 components as i64: + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1 + + */ + + let c_shuffles = [ + i8x16(0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1), + i8x16(4, 5, -1, -1, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, -1, -1), + i8x16(8, 9, -1, -1, -1, -1, -1, -1, 10, 11, -1, -1, -1, -1, -1, -1), + i8x16( + 12, 13, -1, -1, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, -1, -1, + ), + ]; + + let precision = normalizer.precision(); + let initial = i64x2_splat(1 << (precision - 1)); + let mut c_buf = [0i64; 2]; + + let mut dst_chunks_16 = dst_u16.chunks_exact_mut(16); + for dst_chunk in &mut dst_chunks_16 { + let mut sums = [[initial; 2], [initial; 2], [initial; 2], [initial; 2]]; + + let mut y: u32 = 0; + let coeffs_2 = coeffs.chunks_exact(2); + let coeffs_reminder = coeffs_2.remainder(); + + for (src_rows, two_coeffs) in src_img.iter_2_rows(y_start, max_y).zip(coeffs_2) { + let src_rows = src_rows.map(|row| T::components(row)); + + for r in 0..2 { + let coeff_i64x2 = i64x2_splat(two_coeffs[r] as i64); + for x in 0..2 { + let source = wasm32_utils::load_v128(src_rows[r], src_x + x * 8); + for i in 0..4 { + let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]); + sums[i][x] = + i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + } + } + } + y += 2; + } + + if let Some(&k) = coeffs_reminder.first() { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let coeff_i64x2 = i64x2_splat(k as i64); + + for x in 0..2 { + let source = wasm32_utils::load_v128(components, src_x + x * 8); + for i in 0..4 { + let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]); + sums[i][x] = + i64x2_add(sums[i][x], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + } + } + } + + let mut dst_ptr = dst_chunk.as_mut_ptr(); + for x in 0..2 { + for sum in sums { + v128_store((&mut c_buf).as_mut_ptr() as *mut v128, sum[x]); + *dst_ptr = normalizer.clip(c_buf[0]); + dst_ptr = dst_ptr.add(1); + *dst_ptr = normalizer.clip(c_buf[1]); + dst_ptr = dst_ptr.add(1); + } + } + + src_x += 16; + } + + dst_u16 = dst_chunks_16.into_remainder(); + let mut dst_chunks_8 = dst_u16.chunks_exact_mut(8); + if let Some(dst_chunk) = dst_chunks_8.next() { + let mut sums = [initial, initial, initial, initial]; + + let mut y: u32 = 0; + let coeffs_2 = coeffs.chunks_exact(2); + let coeffs_reminder = coeffs_2.remainder(); + + for (src_rows, two_coeffs) in src_img.iter_2_rows(y_start, max_y).zip(coeffs_2) { + let src_rows = src_rows.map(|row| T::components(row)); + let coeffs_i64 = [ + i64x2_splat(two_coeffs[0] as i64), + i64x2_splat(two_coeffs[1] as i64), + ]; + + for r in 0..2 { + let source = wasm32_utils::load_v128(src_rows[r], src_x); + for i in 0..4 { + let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]); + sums[i] = + i64x2_add(sums[i], wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r])); + } + } + y += 2; + } + + if let Some(&k) = coeffs_reminder.first() { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let coeff_i64x2 = i64x2_splat(k as i64); + let source = wasm32_utils::load_v128(components, src_x); + for i in 0..4 { + let c_i64x2 = i8x16_swizzle(source, c_shuffles[i]); + sums[i] = i64x2_add(sums[i], wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + } + } + + let mut dst_ptr = dst_chunk.as_mut_ptr(); + for sum in sums { + // let mask = _mm_cmpgt_epi64(sums[i], zero); + // 
sums[i] = _mm_and_si128(sums[i] , mask); + // sums[i] = _mm_srl_epi64(sums[i] , precision_i64); + // _mm_packus_epi32(sums[i] , sums[i] ); + v128_store((&mut c_buf).as_mut_ptr() as *mut v128, sum); + *dst_ptr = normalizer.clip(c_buf[0]); + dst_ptr = dst_ptr.add(1); + *dst_ptr = normalizer.clip(c_buf[1]); + dst_ptr = dst_ptr.add(1); + } + + src_x += 8; + } + + dst_u16 = dst_chunks_8.into_remainder(); + let mut dst_chunks_4 = dst_u16.chunks_exact_mut(4); + if let Some(dst_chunk) = dst_chunks_4.next() { + let mut c01 = initial; + let mut c23 = initial; + let mut y: u32 = 0; + let coeffs_2 = coeffs.chunks_exact(2); + let coeffs_reminder = coeffs_2.remainder(); + + for (src_rows, two_coeffs) in src_img.iter_2_rows(y_start, max_y).zip(coeffs_2) { + let src_rows = src_rows.map(|row| T::components(row)); + let coeffs_i64 = [ + i64x2_splat(two_coeffs[0] as i64), + i64x2_splat(two_coeffs[1] as i64), + ]; + for r in 0..2 { + let comp_x4 = src_rows[r].get_unchecked(src_x..src_x + 4); + let c_i64x2 = i64x2(comp_x4[0] as i64, comp_x4[1] as i64); + c01 = i64x2_add(c01, wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r])); + let c_i64x2 = i64x2(comp_x4[2] as i64, comp_x4[3] as i64); + c23 = i64x2_add(c23, wasm32_utils::i64x2_mul_lo(c_i64x2, coeffs_i64[r])); + } + y += 2; + } + + if let Some(&k) = coeffs_reminder.first() { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let coeff_i64x2 = i64x2_splat(k as i64); + + let comp_x4 = components.get_unchecked(src_x..src_x + 4); + let c_i64x2 = i64x2(comp_x4[0] as i64, comp_x4[1] as i64); + c01 = i64x2_add(c01, wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + let c_i64x2 = i64x2(comp_x4[2] as i64, comp_x4[3] as i64); + c23 = i64x2_add(c23, wasm32_utils::i64x2_mul_lo(c_i64x2, coeff_i64x2)); + } + + let mut dst_ptr = dst_chunk.as_mut_ptr(); + v128_store((&mut c_buf).as_mut_ptr() as *mut v128, c01); + *dst_ptr = normalizer.clip(c_buf[0]); + dst_ptr = dst_ptr.add(1); + *dst_ptr = normalizer.clip(c_buf[1]); + dst_ptr = dst_ptr.add(1); + v128_store((&mut c_buf).as_mut_ptr() as *mut v128, c23); + *dst_ptr = normalizer.clip(c_buf[0]); + dst_ptr = dst_ptr.add(1); + *dst_ptr = normalizer.clip(c_buf[1]); + + src_x += 4; + } + + dst_u16 = dst_chunks_4.into_remainder(); + if !dst_u16.is_empty() { + let initial = 1 << (precision - 1); + convolution_by_u16( + src_img, normalizer, initial, dst_u16, src_x, y_start, coeffs, + ); + } +} diff --git a/src/convolution/vertical_u8/mod.rs b/src/convolution/vertical_u8/mod.rs index 5432fae..3d2c79e 100644 --- a/src/convolution/vertical_u8/mod.rs +++ b/src/convolution/vertical_u8/mod.rs @@ -10,6 +10,8 @@ pub(crate) mod native; mod neon; #[cfg(target_arch = "x86_64")] pub(crate) mod sse4; +#[cfg(target_arch = "wasm32")] +pub(crate) mod wasm32; pub(crate) fn vert_convolution_u8>( src_image: &ImageView, @@ -29,6 +31,8 @@ pub(crate) fn vert_convolution_u8>( CpuExtensions::Sse4_1 => sse4::vert_convolution(src_image, dst_image, offset, coeffs), #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => neon::vert_convolution(src_image, dst_image, offset, coeffs), + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => wasm32::vert_convolution(src_image, dst_image, offset, coeffs), _ => native::vert_convolution(src_image, dst_image, offset, coeffs), } } diff --git a/src/convolution/vertical_u8/wasm32.rs b/src/convolution/vertical_u8/wasm32.rs new file mode 100644 index 0000000..bb2c698 --- /dev/null +++ b/src/convolution/vertical_u8/wasm32.rs @@ -0,0 +1,291 @@ +use std::arch::wasm32::*; + +use 
crate::convolution::vertical_u8::native; +use crate::convolution::{optimisations, Coefficients}; +use crate::pixels::PixelExt; +use crate::wasm32_utils; +use crate::{ImageView, ImageViewMut}; + +#[inline] +pub(crate) fn vert_convolution>( + src_image: &ImageView, + dst_image: &mut ImageViewMut, + offset: u32, + coeffs: Coefficients, +) { + let normalizer = optimisations::Normalizer16::new(coeffs); + let coefficients_chunks = normalizer.normalized_chunks(); + let src_x = offset as usize * T::count_of_components(); + + let dst_rows = dst_image.iter_rows_mut(); + for (dst_row, coeffs_chunk) in dst_rows.zip(coefficients_chunks) { + unsafe { + vert_convolution_into_one_row_u8(src_image, dst_row, src_x, coeffs_chunk, &normalizer); + } + } +} + +#[target_feature(enable = "simd128")] +pub(crate) unsafe fn vert_convolution_into_one_row_u8>( + src_img: &ImageView, + dst_row: &mut [T], + mut src_x: usize, + coeffs_chunk: optimisations::CoefficientsI16Chunk, + normalizer: &optimisations::Normalizer16, +) { + const ZERO: v128 = i64x2(0, 0); + let y_start = coeffs_chunk.start; + let coeffs = coeffs_chunk.values; + let max_y = y_start + coeffs.len() as u32; + let precision = normalizer.precision(); + let mut dst_u8 = T::components_mut(dst_row); + + let initial = i32x4_splat(1 << (precision - 1)); + + let mut dst_chunks_32 = dst_u8.chunks_exact_mut(32); + for dst_chunk in &mut dst_chunks_32 { + let mut sss0 = initial; + let mut sss1 = initial; + let mut sss2 = initial; + let mut sss3 = initial; + let mut sss4 = initial; + let mut sss5 = initial; + let mut sss6 = initial; + let mut sss7 = initial; + + let mut y: u32 = 0; + + for src_rows in src_img.iter_2_rows(y_start, max_y) { + let components1 = T::components(src_rows[0]); + let components2 = T::components(src_rows[1]); + + // Load two coefficients at once + let mmk = wasm32_utils::ptr_i16_to_set1_i32(coeffs, y as usize); + + let source1 = wasm32_utils::load_v128(components1, src_x); // top line + let source2 = wasm32_utils::load_v128(components2, src_x); // bottom line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + let source = + i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk)); + + let source1 = wasm32_utils::load_v128(components1, src_x + 16); // top line + let source2 = wasm32_utils::load_v128(components2, src_x + 16); // bottom line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss4 = i32x4_add(sss4, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss5 = i32x4_add(sss5, i32x4_dot_i16x8(pix, mmk)); + + let source = + i8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss6 = i32x4_add(sss6, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss7 = i32x4_add(sss7, i32x4_dot_i16x8(pix, mmk)); + + y += 2; + } + + if let Some(&k) = coeffs.get(y as usize) { + let s_row = 
src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let mmk = i32x4_splat(k as i32); + + let source1 = wasm32_utils::load_v128(components, src_x); // top line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, ZERO, + ); + let pix = i16x8_extend_low_u8x16(source); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + let source = i16x8_extend_high_u8x16(source1); + let pix = i16x8_extend_low_u8x16(source); + sss2 = i32x4_add(sss2, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss3 = i32x4_add(sss3, i32x4_dot_i16x8(pix, mmk)); + + let source1 = wasm32_utils::load_v128(components, src_x + 16); // top line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, ZERO, + ); + let pix = i16x8_extend_low_u8x16(source); + sss4 = i32x4_add(sss4, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss5 = i32x4_add(sss5, i32x4_dot_i16x8(pix, mmk)); + + let source = i16x8_extend_high_u8x16(source1); + let pix = i16x8_extend_low_u8x16(source); + sss6 = i32x4_add(sss6, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss7 = i32x4_add(sss7, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss0 = i32x4_shr(sss0, $imm8); + sss1 = i32x4_shr(sss1, $imm8); + sss2 = i32x4_shr(sss2, $imm8); + sss3 = i32x4_shr(sss3, $imm8); + sss4 = i32x4_shr(sss4, $imm8); + sss5 = i32x4_shr(sss5, $imm8); + sss6 = i32x4_shr(sss6, $imm8); + sss7 = i32x4_shr(sss7, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss0 = i16x8_narrow_i32x4(sss0, sss1); + sss2 = i16x8_narrow_i32x4(sss2, sss3); + sss0 = u8x16_narrow_i16x8(sss0, sss2); + let dst_ptr = dst_chunk.as_mut_ptr() as *mut v128; + v128_store(dst_ptr, sss0); + sss4 = i16x8_narrow_i32x4(sss4, sss5); + sss6 = i16x8_narrow_i32x4(sss6, sss7); + sss4 = u8x16_narrow_i16x8(sss4, sss6); + let dst_ptr = dst_ptr.add(1); + v128_store(dst_ptr, sss4); + + src_x += 32; + } + + dst_u8 = dst_chunks_32.into_remainder(); + let mut dst_chunks_8 = dst_u8.chunks_exact_mut(8); + for dst_chunk in &mut dst_chunks_8 { + let mut sss0 = initial; // left row + let mut sss1 = initial; // right row + let mut y: u32 = 0; + + for src_rows in src_img.iter_2_rows(y_start, max_y) { + let components1 = T::components(src_rows[0]); + let components2 = T::components(src_rows[1]); + // Load two coefficients at once + let mmk = wasm32_utils::ptr_i16_to_set1_i32(coeffs, y as usize); + + let source1 = wasm32_utils::loadl_i64(components1, src_x); // top line + let source2 = wasm32_utils::loadl_i64(components2, src_x); // bottom line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss0 = i32x4_add(sss0, i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + + y += 2; + } + + if let Some(&k) = coeffs.get(y as usize) { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let mmk = i32x4_splat(k as i32); + + let source1 = wasm32_utils::loadl_i64(components, src_x); // top line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, ZERO, + ); + let pix = i16x8_extend_low_u8x16(source); + sss0 = i32x4_add(sss0, 
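// Editor's note (hedged, not part of the patch): a summary of the accumulation step at this
// point in vert_convolution_into_one_row_u8. `source` holds the top-row and bottom-row bytes
// interleaved by the i8x16_shuffle above and widened to i16 in `pix`, while `mmk` holds one
// coefficient pair packed into every i32 lane (wasm32_utils::ptr_i16_to_set1_i32). Each i32
// lane of i32x4_dot_i16x8(pix, mmk) is therefore top * coeffs[y] + bottom * coeffs[y + 1],
// i.e. one output component accumulated over two source rows per iteration. In fixed point
// the whole pass computes, per component,
//     out = saturate_u8(((1 << (precision - 1)) + sum_y(src[y] * coeff[y])) >> precision)
// which is what `initial`, the constify_imm8!/i32x4_shr block and the narrowing stores
// implement. The vertical u16 pass earlier in this diff follows the same scheme, but with
// i64 accumulators and i8x16_swizzle shuffles to spread 16-bit components into 64-bit lanes.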
i32x4_dot_i16x8(pix, mmk)); + let pix = i16x8_extend_high_u8x16(source); + sss1 = i32x4_add(sss1, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss0 = i32x4_shr(sss0, $imm8); + sss1 = i32x4_shr(sss1, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss0 = i16x8_narrow_i32x4(sss0, sss1); + sss0 = u8x16_narrow_i16x8(sss0, sss0); + let dst_ptr = dst_chunk.as_mut_ptr() as *mut [i64; 2]; + (*dst_ptr)[0] = i64x2_extract_lane::<0>(sss0); + + src_x += 8; + } + + dst_u8 = dst_chunks_8.into_remainder(); + let mut dst_chunks_4 = dst_u8.chunks_exact_mut(4); + if let Some(dst_chunk) = dst_chunks_4.next() { + let mut sss = initial; + let mut y: u32 = 0; + + for src_rows in src_img.iter_2_rows(y_start, max_y) { + let components1 = T::components(src_rows[0]); + let components2 = T::components(src_rows[1]); + // Load two coefficients at once + let mmk = wasm32_utils::ptr_i16_to_set1_i32(coeffs, y as usize); + + let source1 = wasm32_utils::i32x4_v128_from_u8(components1, src_x); // top line + let source2 = wasm32_utils::i32x4_v128_from_u8(components2, src_x); // bottom line + + let source = i8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( + source1, source2, + ); + let pix = i16x8_extend_low_u8x16(source); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + + y += 2; + } + + if let Some(&k) = coeffs.get(y as usize) { + let s_row = src_img.get_row(y_start + y).unwrap(); + let components = T::components(s_row); + let pix = wasm32_utils::i32x4_extend_low_ptr_u8(components, src_x); + let mmk = i32x4_splat(k as i32); + sss = i32x4_add(sss, i32x4_dot_i16x8(pix, mmk)); + } + + macro_rules! call { + ($imm8:expr) => {{ + sss = i32x4_shr(sss, $imm8); + }}; + } + constify_imm8!(precision, call); + + sss = i16x8_narrow_i32x4(sss, sss); + let dst_ptr = dst_chunk.as_mut_ptr() as *mut i32; + *dst_ptr = i32x4_extract_lane::<0>(u8x16_narrow_i16x8(sss, sss)); + + src_x += 4; + } + + dst_u8 = dst_chunks_4.into_remainder(); + if !dst_u8.is_empty() { + native::convolution_by_u8( + src_img, + normalizer, + 1 << (precision - 1), + dst_u8, + src_x, + y_start, + coeffs, + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index 4563ab8..e0e3278 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,3 +34,5 @@ mod resizer; mod simd_utils; #[cfg(feature = "for_test")] pub mod testing; +#[cfg(target_arch = "wasm32")] +mod wasm32_utils; diff --git a/src/resizer.rs b/src/resizer.rs index 549d735..068e3b2 100644 --- a/src/resizer.rs +++ b/src/resizer.rs @@ -16,6 +16,8 @@ pub enum CpuExtensions { Avx2, #[cfg(target_arch = "aarch64")] Neon, + #[cfg(target_arch = "wasm32")] + Wasm32, } impl CpuExtensions { @@ -28,6 +30,8 @@ impl CpuExtensions { Self::Sse4_1 => is_x86_feature_detected!("sse4.1"), #[cfg(target_arch = "aarch64")] Self::Neon => true, + #[cfg(target_arch = "wasm32")] + Self::Wasm32 => true, Self::None => true, } } @@ -54,8 +58,16 @@ impl Default for CpuExtensions { Self::None } } + #[cfg(target_arch = "wasm32")] + fn default() -> Self { + Self::Wasm32 + } - #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] + #[cfg(not(any( + target_arch = "x86_64", + target_arch = "aarch64", + target_arch = "wasm32" + )))] fn default() -> Self { Self::None } diff --git a/src/wasm32_utils.rs b/src/wasm32_utils.rs new file mode 100644 index 0000000..afb5e2e --- /dev/null +++ b/src/wasm32_utils.rs @@ -0,0 +1,73 @@ +use crate::pixels::{U8x3, U8x4}; +use std::arch::wasm32::*; +use std::intrinsics::transmute; + +#[inline(always)] +pub unsafe fn load_v128(buf: &[T], index: usize) -> 
v128 { + v128_load(buf.get_unchecked(index..).as_ptr() as *const v128) +} + +#[inline(always)] +pub unsafe fn loadl_i64(buf: &[T], index: usize) -> v128 { + let i = buf.get_unchecked(index..).as_ptr() as *const i64; + i64x2(*i, 0) +} + +#[inline(always)] +pub unsafe fn loadl_i32(buf: &[T], index: usize) -> v128 { + let i = buf.get_unchecked(index..).as_ptr() as *const i32; + i32x4(*i, 0, 0, 0) +} + +#[inline(always)] +pub unsafe fn loadl_i16(buf: &[T], index: usize) -> v128 { + let i = buf.get_unchecked(index..).as_ptr() as *const i16; + i16x8(*i, 0, 0, 0, 0, 0, 0, 0) +} + +#[inline(always)] +pub unsafe fn ptr_i16_to_set1_i64(buf: &[i16], index: usize) -> v128 { + i64x2_splat(*(buf.get_unchecked(index..).as_ptr() as *const i64)) +} + +#[inline(always)] +pub unsafe fn ptr_i16_to_set1_i32(buf: &[i16], index: usize) -> v128 { + i32x4_splat(*(buf.get_unchecked(index..).as_ptr() as *const i32)) +} + +#[inline(always)] +pub unsafe fn i32x4_extend_low_ptr_u8(buf: &[u8], index: usize) -> v128 { + let ptr = buf.get_unchecked(index..).as_ptr() as *const v128; + u32x4_extend_low_u16x8(i16x8_extend_low_u8x16(v128_load(ptr))) +} + +#[inline(always)] +pub unsafe fn i32x4_extend_low_ptr_u8x4(buf: &[U8x4], index: usize) -> v128 { + let v: u32 = transmute(buf.get_unchecked(index).0); + u32x4_extend_low_u16x8(i16x8_extend_low_u8x16(u32x4(v, 0, 0, 0))) +} + +#[inline(always)] +pub unsafe fn i32x4_extend_low_ptr_u8x3(buf: &[U8x3], index: usize) -> v128 { + let pixel = buf.get_unchecked(index).0; + i32x4(pixel[0] as i32, pixel[1] as i32, pixel[2] as i32, 0) +} + +#[inline(always)] +pub unsafe fn i32x4_v128_from_u8(buf: &[u8], index: usize) -> v128 { + let ptr = buf.get_unchecked(index..).as_ptr() as *const i32; + i32x4(*ptr, 0, 0, 0) +} + +#[inline(always)] +pub unsafe fn u16x8_mul_hi(a: v128, b: v128) -> v128 { + let lo = u32x4_extmul_low_u16x8(a, b); + let hi = u32x4_extmul_high_u16x8(a, b); + i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(lo, hi) +} + +#[inline(always)] +pub unsafe fn i64x2_mul_lo(a: v128, b: v128) -> v128 { + const SHUFFLE: v128 = i8x16(0, 1, 2, 3, 8, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1); + i64x2_extmul_low_i32x4(i8x16_swizzle(a, SHUFFLE), i8x16_swizzle(b, SHUFFLE)) +} diff --git a/testing/src/lib.rs b/testing/src/lib.rs index b70976b..7ad204e 100644 --- a/testing/src/lib.rs +++ b/testing/src/lib.rs @@ -272,5 +272,7 @@ pub fn cpu_ext_into_str(cpu_extensions: CpuExtensions) -> &'static str { CpuExtensions::Avx2 => "avx2", #[cfg(target_arch = "aarch64")] CpuExtensions::Neon => "neon", + #[cfg(target_arch = "wasm32")] + CpuExtensions::Wasm32 => "wasm32", } } diff --git a/tests/alpha_tests.rs b/tests/alpha_tests.rs index 1464df9..81348ea 100644 --- a/tests/alpha_tests.rs +++ b/tests/alpha_tests.rs @@ -154,6 +154,12 @@ mod multiply_alpha_u8x4 { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -206,6 +212,12 @@ mod multiply_alpha_u8x2 { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -258,6 +270,12 @@ mod multiply_alpha_u16x2 { 
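// Editor's note (hedged sketch, not part of the patch): scalar models of the two least
// obvious helpers added in src/wasm32_utils.rs above. `u16x8_mul_hi` emulates SSE's
// _mm_mulhi_epu16: the widening multiplies produce eight u32 products, and the
// i16x8_shuffle of the odd 16-bit lanes keeps each product's high half. `i64x2_mul_lo`
// multiplies only the low 32 bits of each 64-bit lane, which is exact for this crate's
// inputs because pixel components and coefficients fit in i32. The model function names
// are hypothetical and exist only for illustration.
fn u16x8_mul_hi_model(a: [u16; 8], b: [u16; 8]) -> [u16; 8] {
    // High 16 bits of each unsigned 16x16 -> 32-bit product.
    core::array::from_fn(|i| ((a[i] as u32 * b[i] as u32) >> 16) as u16)
}
fn i64x2_mul_lo_model(a: [i64; 2], b: [i64; 2]) -> [i64; 2] {
    // Product of the sign-extended low 32 bits of each lane.
    core::array::from_fn(|i| (a[i] as i32 as i64) * (b[i] as i32 as i64))
}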
mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -298,6 +316,12 @@ mod multiply_alpha_u16x4 { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(Oper::Mul, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -336,6 +360,12 @@ mod divide_alpha_u8x4 { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -388,6 +418,12 @@ mod divide_alpha_u8x2 { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -451,6 +487,12 @@ mod divide_alpha_u16x2 { mul_div_alpha_test(OPER, SRC_PIXELS, SIMD_RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); @@ -496,6 +538,12 @@ mod divide_alpha_u16x4 { mul_div_alpha_test(OPER, SRC_PIXELS, SIMD_RES_PIXELS, CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + #[test] + fn wasm32_test() { + mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::Wasm32); + } + #[test] fn native_test() { mul_div_alpha_test(OPER, SRC_PIXELS, RES_PIXELS, CpuExtensions::None); diff --git a/tests/resize_tests.rs b/tests/resize_tests.rs index 4783bcd..7f7cca1 100644 --- a/tests/resize_tests.rs +++ b/tests/resize_tests.rs @@ -203,6 +203,10 @@ fn resize_to_same_width_after_cropping() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { if !cpu_extensions.is_supported() { continue; @@ -376,6 +380,10 @@ fn downscale_u8() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -400,6 +408,10 @@ fn upscale_u8() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -424,6 +436,10 @@ fn downscale_u8x2() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -452,6 +468,10 @@ fn upscale_u8x2() { { 
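// Editor's note (hedged sketch, not part of the patch): the cfg'd push of
// CpuExtensions::Wasm32 added below is repeated in every test of this file; a helper like
// the hypothetical one sketched here could build the per-architecture list once. The
// baseline entries are assumed, since the start of cpu_extensions_vec is not visible in
// these hunks. Note that on wasm32 builds CpuExtensions::Wasm32 also becomes the Default
// (see the src/resizer.rs hunk above).
fn arch_cpu_extensions() -> Vec<CpuExtensions> {
    let mut extensions = vec![CpuExtensions::None];
    #[cfg(target_arch = "x86_64")]
    {
        extensions.push(CpuExtensions::Sse4_1);
        extensions.push(CpuExtensions::Avx2);
    }
    #[cfg(target_arch = "aarch64")]
    {
        extensions.push(CpuExtensions::Neon);
    }
    #[cfg(target_arch = "wasm32")]
    {
        extensions.push(CpuExtensions::Wasm32);
    }
    extensions
}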
cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -480,6 +500,10 @@ fn downscale_u8x3() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -508,6 +532,10 @@ fn upscale_u8x3() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -536,6 +564,10 @@ fn downscale_u8x4() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -570,6 +602,10 @@ fn upscale_u8x4() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -594,6 +630,10 @@ fn downscale_u16() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -618,6 +658,10 @@ fn upscale_u16() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -646,6 +690,10 @@ fn downscale_u16x2() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -674,6 +722,10 @@ fn upscale_u16x2() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -702,6 +754,10 @@ fn downscale_u16x3() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -730,6 +786,10 @@ fn upscale_u16x3() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -758,6 +818,10 @@ fn downscale_u16x4() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = "wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::downscale_test( ResizeAlg::Convolution(FilterType::Lanczos3), @@ -786,6 +850,10 @@ fn upscale_u16x4() { { cpu_extensions_vec.push(CpuExtensions::Neon); } + #[cfg(target_arch = 
"wasm32")] + { + cpu_extensions_vec.push(CpuExtensions::Wasm32); + } for cpu_extensions in cpu_extensions_vec { P::upscale_test( ResizeAlg::Convolution(FilterType::Lanczos3),